Merge "libvpx: Pull from upstream" into mnc-dr-dev
diff --git a/README.android b/README.android
new file mode 100644
index 0000000..5949fc6
--- /dev/null
+++ b/README.android
@@ -0,0 +1,59 @@
+Name: libvpx
+URL: http://www.webmproject.org
+Version: v1.4.0
+License: BSD
+License File: libvpx/LICENSE
+
+Date: Tuesday August 25 2015
+Branch: origin/master
+Commit: 7105df53d7dc13d5e575bc8df714ec8d1da36b06
+
+Description:
+Contains the sources used to compile libvpx.
+
+The libvpx source is from webmproject.org:
+  https://chromium.googlesource.com/webm/libvpx
+
+Notes on updating libvpx source code:
+
+Please follow these steps to update the libvpx source code (a combined
+example appears after the list):
+
+1. Update the libvpx source tree. Use the "Current HEAD: <hash>" output to
+   update README.android, and the "git log from upstream: <git log>" output
+   for the commit message.
+
+   ./update_libvpx.sh [branch name]
+
+2. Generate updated .gypi and config files.
+
+   ./generate_config.sh
+
+3. Update this file with any Version, Date, Branch, or Commit changes. The
+   version is in source/config/vpx_version.h.
+
+4. Remove the following unwanted files/directories:
+   * external/libvpx/libvpx/vp10
+
+5. Commit the changes. The commit message is printed by update_libvpx.sh and
+   should look like this:
+   libvpx: Pull from upstream
+
+   Current HEAD: <hash>
+
+   git log from upstream:
+   a6b2070 <git commit message 1>
+   08dabbc <git commit message 2>
+   c29fb02 <git commit message 3>
+
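+Taken together, a typical update of the tree looks like this minimal sketch
+(assuming the commands are run from external/libvpx and that origin/master
+is the branch being pulled; adjust both as needed):
+
+   cd external/libvpx
+   ./update_libvpx.sh origin/master    # note the printed HEAD and git log
+   ./generate_config.sh                # regenerate the .gypi and config files
+   # update Version/Date/Branch/Commit in this README (step 3)
+   rm -rf libvpx/vp10                  # remove unwanted directories (step 4)
+   git commit                          # message printed by update_libvpx.sh
+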
+Tools needed to build libvpx:
+
+- generate_config.sh
+
+Generates the config files, which contain the source list for each platform.
+The build configuration is taken from each platform's vpx_config.h.
+
+- lint_config.sh
+
+A tool to verify that vpx_config.h and vpx_config.asm match. It also prints
+the final configuration after checking.
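+
+For a quick sanity check after step 2, the regenerated config files can be
+reviewed with git before committing (a minimal sketch; the exact layout
+under source/config/ is assumed):
+
+   git status source/config/    # newly added or removed generated files
+   git diff -- source/config/   # changes to the generated config files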
+
diff --git a/UPDATING b/UPDATING
deleted file mode 100644
index ed5036c..0000000
--- a/UPDATING
+++ /dev/null
@@ -1,52 +0,0 @@
-Pull the new libvpx checkout into external/libvpx/libvpx:
-$ cd external/libvpx/
-$ rm -rf libvpx
-$ git clone http://git.chromium.org/webm/libvpx.git
-$ cd libvpx
-$ git checkout <branch>
-
-Enter the subdirectory for the relevant platform. For example, armv7-neon for
-armv7 targets with neon extensions. We disable many features. Some for
-functional reasons and some for aesthetic ones.
-
-Functional:
---force-target=$TARGET
-  The "Android" support in libvpx is targeted at the NDK and does not yet
-  include support for architectures such as MIPS. However, we can still generate
-  the necessary files for them.
---disable-runtime-cpu-detect
---disable-neon
-  The platform knows at build time what extensions are supported.
---sdk-path=$SDK_PATH
-  For configuration we do some compiler tests. It is much easier to accept them
-  than to work around them. This uses the compilers included in the NDK.
---enable-realtime-only
-  Reduce binary size when building the encoder.
-
-Aesthetic:
---disable-examples
---disable-docs
-  Skip unnecessary extra makefiles.
-
-Example:
-$ cd external/libvpx/armv7a
-$ ../libvpx/configure --target=armv7-android-gcc --disable-runtime-cpu-detect \
-  --disable-neon --sdk-path=$ANDROID_NDK_ROOT --disable-vp9-encoder \
-  --disable-examples --disable-docs
-
-Run 'make libvpx_srcs.txt'
-This will generate a file listing all of the required sources. It will also
-generate vpx_rtcd.h and vpx_version.h
-
-Remove the unused files leaving only:
-libvpx_srcs.txt
-vpx_config.c
-vpx_config.h
-vpx_scale_rtcd.h
-vp8_rtcd.h
-vp9_rtcd.h
-vpx_version.h
-
-Remove the .git* from external/libvpx/libvpx:
-$ cd external/libvpx/libvpx
-$ rm -rf .git*
diff --git a/armv7a-neon/vp8_rtcd.h b/armv7a-neon/vp8_rtcd.h
deleted file mode 100644
index 66ad28a..0000000
--- a/armv7a-neon/vp8_rtcd.h
+++ /dev/null
@@ -1,415 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_clear_system_state_c();
-#define vp8_clear_system_state vp8_clear_system_state_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_neon
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_v6(struct blockd*, short *dqc);
-void vp8_dequantize_b_neon(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_neon
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_armv6(struct block *, struct blockd *);
-void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-void vp8_fast_quantize_b_pair_neon(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_neon
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_neon
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
-
-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_neon
-
-void vp8_quantize_mb_c(struct macroblock *);
-void vp8_quantize_mb_neon(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_neon
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-void vp8_quantize_mbuv_neon(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
-
-void vp8_quantize_mby_c(struct macroblock *);
-void vp8_quantize_mby_neon(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_neon
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_armv6(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_neon
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_neon
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_neon
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_neon
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_neon
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
-void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
-void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_v6(short *input, short *output);
-void vp8_short_inv_walsh4x4_neon(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);
-void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_armv6
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_neon(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_neon
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_neon(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_neon
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_b_armv6(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_b_neon(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_neon
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-void vp8_subtract_mbuv_armv6(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_neon
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_armv6(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_neon
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_neon
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_neon
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_neon
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_neon
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_neon
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_neon
-
-void vp8_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h
deleted file mode 100644
index ff6a27e..0000000
--- a/armv7a-neon/vp9_rtcd.h
+++ /dev/null
@@ -1,755 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_neon
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_neon
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_neon
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_neon
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_neon
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_neon
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_neon
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_neon(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_neon
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_neon
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_neon
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_neon
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_neon
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_neon
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_neon
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_neon
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_neon
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_neon
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_neon
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_neon
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_neon
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_1024_add_neon
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_neon
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_neon
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_12_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_neon
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_neon
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_neon
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_neon
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_neon
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_neon
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_neon
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_neon
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_neon
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_neon
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_neon
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_neon
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_neon
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_neon
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_c
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_neon
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_neon
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_neon
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_neon
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_neon
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_neon
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_neon
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_neon
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_neon
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_neon
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_neon
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_neon
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_neon
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_neon
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_neon
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_neon
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_neon
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_neon
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_neon
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_neon
-
-void vp9_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/armv7a-neon/vpx_config.c b/armv7a-neon/vpx_config.c
deleted file mode 100644
index 8c4d0fa..0000000
--- a/armv7a-neon/vpx_config.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
-/*  */
-/* Use of this source code is governed by a BSD-style license */
-/* that can be found in the LICENSE file in the root of the source */
-/* tree. An additional intellectual property rights grant can be found */
-/* in the file PATENTS.  All contributing project authors may */
-/* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/vigneshv/Downloads/android-ndk-r10 --disable-examples --disable-docs --enable-realtime-only";
-const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/armv7a-neon/vpx_version.h b/armv7a-neon/vpx_version.h
deleted file mode 100644
index 59adf99..0000000
--- a/armv7a-neon/vpx_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define VERSION_MAJOR  1
-#define VERSION_MINOR  3
-#define VERSION_PATCH  0
-#define VERSION_EXTRA  "3825-gd4a47a6"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0-3825-gd4a47a6"
-#define VERSION_STRING      " v1.3.0-3825-gd4a47a6"
diff --git a/armv7a/vp8_rtcd.h b/armv7a/vp8_rtcd.h
deleted file mode 100644
index 8cc9c15..0000000
--- a/armv7a/vp8_rtcd.h
+++ /dev/null
@@ -1,361 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_armv6
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_armv6
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_armv6
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_clear_system_state_c();
-#define vp8_clear_system_state vp8_clear_system_state_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_v6
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_v6
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_v6
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_v6
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_c
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_v6(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_v6
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_armv6(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_armv6
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_armv6
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_armv6
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_armv6
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_armv6
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_armv6
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_armv6
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_armv6
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_armv6
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
-
-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_armv6
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_armv6(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_armv6
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_c
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_armv6
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_armv6
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_v6_dual
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_v6(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_v6
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_armv6
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_armv6
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_armv6
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_armv6
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_armv6
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_armv6
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_armv6
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_b_armv6(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_armv6
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-void vp8_subtract_mbuv_armv6(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_armv6
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_armv6(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_armv6
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_armv6
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_c
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_c
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_armv6
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_armv6
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
-void vp8_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h
deleted file mode 100644
index 0ebc52b..0000000
--- a/armv7a/vp9_rtcd.h
+++ /dev/null
@@ -1,695 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_c
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_c
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_c
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_c
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_c
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_c
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_c
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_c
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_c
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_c
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_c
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_c
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_c
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_c
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_c
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_c
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_c
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_c
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_c
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_c
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_c
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_c
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_c
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_c
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_c
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_c
-
-void vp9_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
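Note on the generated RTCD headers removed above: because these per-platform configs pass --disable-runtime-cpu-detect, rtcd.pl pins each generic name straight to a single implementation with a #define, and setup_rtcd_internal() has nothing left to do beyond reading arm_cpu_caps(). When runtime detection is enabled, the same script instead emits RTCD_EXTERN function pointers that setup_rtcd_internal() assigns from the detected capability flags. The following is only a minimal sketch of that dispatch pattern, not the generated code; my_predictor, HAS_NEON and cpu_caps are illustrative stand-ins, not libvpx symbols.

  #include <stdint.h>
  #include <stdio.h>

  #define HAS_NEON 0x01                       /* illustrative capability bit */

  /* Generic C path and a SIMD path; the generated headers declare both. */
  static void my_predictor_c(uint8_t *dst)    { dst[0] = 0; }
  static void my_predictor_neon(uint8_t *dst) { dst[0] = 1; }

  /* With runtime detection the generated header declares this with
   * RTCD_EXTERN (empty in the one file defining RTCD_C, "extern" elsewhere),
   * so the pointer has exactly one definition. */
  static void (*my_predictor)(uint8_t *dst);

  static int cpu_caps(void) { return 0; }     /* stand-in for arm_cpu_caps() */

  static void setup_rtcd_internal(void) {
    int flags = cpu_caps();
    my_predictor = my_predictor_c;            /* safe default */
    if (flags & HAS_NEON) my_predictor = my_predictor_neon;
  }

  int main(void) {
    uint8_t pixel = 0xff;
    setup_rtcd_internal();
    my_predictor(&pixel);
    printf("dispatched to %s\n", pixel ? "NEON" : "C");
    return 0;
  }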
diff --git a/armv7a/vpx_config.c b/armv7a/vpx_config.c
deleted file mode 100644
index 584958f..0000000
--- a/armv7a/vpx_config.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
-/*  */
-/* Use of this source code is governed by a BSD-style license */
-/* that can be found in the LICENSE file in the root of the source */
-/* tree. An additional intellectual property rights grant can be found */
-/* in the file PATENTS.  All contributing project authors may */
-/* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/vigneshv/Downloads/android-ndk-r10 --disable-neon --disable-neon-asm --disable-examples --disable-docs --enable-realtime-only";
-const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h
deleted file mode 100644
index 0a6d790..0000000
--- a/armv7a/vpx_scale_rtcd.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef VPX_SCALE_RTCD_H_
-#define VPX_SCALE_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct yv12_buffer_config;
-
-void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
-
-void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
-
-void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
-
-void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
-
-void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
-
-void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
-
-void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
-
-void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
-
-void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
-
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
-
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
-
-void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vpx_yv12_copy_y vpx_yv12_copy_y_c
-
-void vpx_scale_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/armv7a/vpx_version.h b/armv7a/vpx_version.h
deleted file mode 100644
index 59adf99..0000000
--- a/armv7a/vpx_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define VERSION_MAJOR  1
-#define VERSION_MINOR  3
-#define VERSION_PATCH  0
-#define VERSION_EXTRA  "3825-gd4a47a6"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0-3825-gd4a47a6"
-#define VERSION_STRING      " v1.3.0-3825-gd4a47a6"
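The VERSION_PACKED macro in the header just deleted folds major/minor/patch into one integer so version comparisons reduce to an integer compare. A quick check of the arithmetic, using the values from the deleted armv7a header (v1.3.0); this is only a standalone sanity check, not part of the tree:

  #include <stdio.h>

  #define VERSION_MAJOR  1
  #define VERSION_MINOR  3
  #define VERSION_PATCH  0
  #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

  int main(void) {
    /* 1<<16 | 3<<8 | 0 = 0x010300 = 66304 */
    printf("0x%06x (%d)\n", VERSION_PACKED, VERSION_PACKED);
    return 0;
  }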
diff --git a/config.arm.mk b/config.arm.mk
index 3f0f0b7..6ae58b8 100644
--- a/config.arm.mk
+++ b/config.arm.mk
@@ -4,13 +4,15 @@
 # libvpx_codec_srcs_asm_arm
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
-libvpx_target := armv7a-neon
+libvpx_target := config/arm-neon
 else ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
-libvpx_target := armv7a
+libvpx_target := config/arm
 else
-libvpx_target := generic
+libvpx_target := config/generic
 endif
 
+LOCAL_ARM_MODE := arm
+
 libvpx_config_dir_arm := $(LOCAL_PATH)/$(libvpx_target)
 libvpx_codec_srcs := $(sort $(shell cat $(libvpx_config_dir_arm)/libvpx_srcs.txt))
 
@@ -19,4 +21,4 @@
     $(filter %.c, $(libvpx_codec_srcs)))) \
     $(libvpx_target)/vpx_config.c
 
-libvpx_codec_srcs_asm_arm := $(filter %.asm.s, $(libvpx_codec_srcs))
+libvpx_codec_srcs_asm_arm := $(filter %.asm, $(libvpx_codec_srcs))
diff --git a/config.arm64.mk b/config.arm64.mk
index eb7e167..0740fd9 100644
--- a/config.arm64.mk
+++ b/config.arm64.mk
@@ -3,7 +3,9 @@
 # libvpx_codec_srcs_c_arm64
 # libvpx_codec_srcs_asm_arm64
 
-libvpx_target := generic
+libvpx_target := config/arm64
+
+LOCAL_ARM_MODE := arm
 
 libvpx_config_dir_arm64 := $(LOCAL_PATH)/$(libvpx_target)
 libvpx_codec_srcs := $(sort $(shell cat $(libvpx_config_dir_arm64)/libvpx_srcs.txt))
diff --git a/config.mips.mk b/config.mips.mk
index 842c0ab..f39e7b1 100644
--- a/config.mips.mk
+++ b/config.mips.mk
@@ -1,16 +1,15 @@
 # Output variables:
 # libvpx_config_dir_mips
 # libvpx_codec_srcs_c_mips
-# libvpx_codec_srcs_asm_mips
 
 ifneq ($(ARCH_HAS_BIGENDIAN),true)
   ifeq ($(ARCH_MIPS_DSP_REV),2)
-    libvpx_target := mips-dspr2
+    libvpx_target := config/mips32-dspr2
   else
-    libvpx_target := mips
+    libvpx_target := config/mips32
   endif
 else
-  libvpx_target := generic
+  libvpx_target := config/generic
 endif
 
 libvpx_config_dir_mips := $(LOCAL_PATH)/$(libvpx_target)
@@ -20,5 +19,3 @@
 libvpx_codec_srcs_c_mips := $(addprefix libvpx/, $(filter-out vpx_config.c, \
     $(filter %.c, $(libvpx_codec_srcs)))) \
     $(libvpx_target)/vpx_config.c
-
-libvpx_codec_srcs_asm_mips := $(filter %.asm.s, $(libvpx_codec_srcs))
diff --git a/config.mips64.mk b/config.mips64.mk
index fa1f1d2..5e6e3e8 100644
--- a/config.mips64.mk
+++ b/config.mips64.mk
@@ -1,9 +1,8 @@
 # Output variables:
 # libvpx_config_dir_mips64
 # libvpx_codec_srcs_c_mips64
-# libvpx_codec_srcs_asm_mips64
 
-libvpx_target := generic
+libvpx_target := config/generic
 
 libvpx_config_dir_mips64 := $(LOCAL_PATH)/$(libvpx_target)
 libvpx_codec_srcs := $(sort $(shell cat $(libvpx_config_dir_mips64)/libvpx_srcs.txt))
@@ -12,5 +11,3 @@
 libvpx_codec_srcs_c_mips64 := $(addprefix libvpx/, $(filter-out vpx_config.c, \
     $(filter %.c, $(libvpx_codec_srcs)))) \
     $(libvpx_target)/vpx_config.c
-
-libvpx_codec_srcs_asm_mips64 := $(filter %.asm.s, $(libvpx_codec_srcs))
diff --git a/config.x86.mk b/config.x86.mk
index 5d604b9..3ebb313 100644
--- a/config.x86.mk
+++ b/config.x86.mk
@@ -1,9 +1,8 @@
 # Output variables:
 # libvpx_config_dir_x86
 # libvpx_codec_srcs_c_x86
-# libvpx_codec_srcs_asm_x86
 
-libvpx_target := x86
+libvpx_target := config/x86
 
 libvpx_config_dir_x86 := $(LOCAL_PATH)/$(libvpx_target)
 libvpx_codec_srcs := $(sort $(shell cat $(libvpx_config_dir_x86)/libvpx_srcs.txt))
diff --git a/config.x86_64.mk b/config.x86_64.mk
index ed003d6..fb9ceda 100644
--- a/config.x86_64.mk
+++ b/config.x86_64.mk
@@ -3,7 +3,7 @@
 # libvpx_codec_srcs_c_x86_64
 # libvpx_codec_srcs_asm_x86_64
 
-libvpx_target := generic
+libvpx_target := config/generic
 
 libvpx_config_dir_x86_64 := $(LOCAL_PATH)/$(libvpx_target)
 libvpx_codec_srcs := $(sort $(shell cat $(libvpx_config_dir_x86_64)/libvpx_srcs.txt))
@@ -13,4 +13,5 @@
     $(filter %.c, $(libvpx_codec_srcs)))) \
     $(libvpx_target)/vpx_config.c
 
-libvpx_codec_srcs_asm_x86_64 := $(filter %.asm.s, $(libvpx_codec_srcs))
+# X86_64 asm files are processed by the system and sent to yasm
+libvpx_codec_srcs_c_x86_64 += $(addprefix libvpx/, $(filter %.asm, $(libvpx_codec_srcs)))
diff --git a/armv7a-neon/libvpx_srcs.txt b/config/arm-neon/libvpx_srcs.txt
similarity index 65%
rename from armv7a-neon/libvpx_srcs.txt
rename to config/arm-neon/libvpx_srcs.txt
index ea4fbb3..9d5084c 100644
--- a/armv7a-neon/libvpx_srcs.txt
+++ b/config/arm-neon/libvpx_srcs.txt
@@ -1,30 +1,24 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
-vp8/common/arm/armv6/bilinearfilter_v6.asm.s
-vp8/common/arm/armv6/copymem16x16_v6.asm.s
-vp8/common/arm/armv6/copymem8x4_v6.asm.s
-vp8/common/arm/armv6/copymem8x8_v6.asm.s
-vp8/common/arm/armv6/dc_only_idct_add_v6.asm.s
-vp8/common/arm/armv6/dequant_idct_v6.asm.s
-vp8/common/arm/armv6/dequantize_v6.asm.s
-vp8/common/arm/armv6/filter_v6.asm.s
+vp8/common/arm/armv6/bilinearfilter_v6.asm
+vp8/common/arm/armv6/copymem16x16_v6.asm
+vp8/common/arm/armv6/copymem8x4_v6.asm
+vp8/common/arm/armv6/copymem8x8_v6.asm
+vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+vp8/common/arm/armv6/dequant_idct_v6.asm
+vp8/common/arm/armv6/dequantize_v6.asm
+vp8/common/arm/armv6/filter_v6.asm
 vp8/common/arm/armv6/idct_blk_v6.c
-vp8/common/arm/armv6/idct_v6.asm.s
-vp8/common/arm/armv6/intra4x4_predict_v6.asm.s
-vp8/common/arm/armv6/iwalsh_v6.asm.s
-vp8/common/arm/armv6/loopfilter_v6.asm.s
-vp8/common/arm/armv6/simpleloopfilter_v6.asm.s
-vp8/common/arm/armv6/sixtappredict8x4_v6.asm.s
-vp8/common/arm/armv6/vp8_sad16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance8x8_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm.s
+vp8/common/arm/armv6/idct_v6.asm
+vp8/common/arm/armv6/intra4x4_predict_v6.asm
+vp8/common/arm/armv6/iwalsh_v6.asm
+vp8/common/arm/armv6/loopfilter_v6.asm
+vp8/common/arm/armv6/simpleloopfilter_v6.asm
+vp8/common/arm/armv6/sixtappredict8x4_v6.asm
 vp8/common/arm/bilinearfilter_arm.c
 vp8/common/arm/bilinearfilter_arm.h
 vp8/common/arm/dequantize_arm.c
@@ -36,25 +30,21 @@
 vp8/common/arm/neon/dequant_idct_neon.c
 vp8/common/arm/neon/dequantizeb_neon.c
 vp8/common/arm/neon/idct_blk_neon.c
-vp8/common/arm/neon/idct_dequant_0_2x_neon.asm.s
-vp8/common/arm/neon/idct_dequant_full_2x_neon.asm.s
+vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+vp8/common/arm/neon/idct_dequant_full_2x_neon.c
 vp8/common/arm/neon/iwalsh_neon.c
-vp8/common/arm/neon/loopfilter_neon.asm.s
 vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
-vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm.s
+vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
 vp8/common/arm/neon/mbloopfilter_neon.c
-vp8/common/arm/neon/sad_neon.c
+vp8/common/arm/neon/reconintra_neon.c
 vp8/common/arm/neon/shortidct4x4llm_neon.c
 vp8/common/arm/neon/sixtappredict_neon.c
-vp8/common/arm/neon/variance_neon.c
-vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm.s
-vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm.s
-vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm.s
-vp8/common/arm/variance_arm.c
+vp8/common/arm/neon/vp8_loopfilter_neon.c
 vp8/common/blockd.c
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -75,27 +65,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -104,9 +92,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -119,29 +106,17 @@
 vp8/decoder/onyxd_int.h
 vp8/decoder/threading.c
 vp8/decoder/treereader.h
-vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm.s
-vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_subtract_armv6.asm.s
-vp8/encoder/arm/armv6/walsh_v6.asm.s
-vp8/encoder/arm/boolhuff_arm.c
+vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
+vp8/encoder/arm/armv6/walsh_v6.asm
 vp8/encoder/arm/dct_arm.c
 vp8/encoder/arm/neon/denoising_neon.c
-vp8/encoder/arm/neon/fastquantizeb_neon.asm.s
-vp8/encoder/arm/neon/picklpf_arm.c
-vp8/encoder/arm/neon/shortfdct_neon.asm.s
-vp8/encoder/arm/neon/subtract_neon.asm.s
-vp8/encoder/arm/neon/vp8_memcpy_neon.asm.s
-vp8/encoder/arm/neon/vp8_mse16x16_neon.asm.s
-vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm.s
-vp8/encoder/arm/quantize_arm.c
+vp8/encoder/arm/neon/fastquantizeb_neon.c
+vp8/encoder/arm/neon/shortfdct_neon.c
+vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
 vp8/encoder/bitstream.c
 vp8/encoder/bitstream.h
 vp8/encoder/block.h
+vp8/encoder/boolhuff.c
 vp8/encoder/boolhuff.h
 vp8/encoder/dct.c
 vp8/encoder/dct_value_cost.h
@@ -170,7 +145,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -182,45 +156,22 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
-vp8/vp8cx_arm.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
+vp8/vp8cx_arm.mk
 vp8/vp8dx.mk
-vp9/common/arm/neon/vp9_avg_neon.asm.s
-vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s
-vp9/common/arm/neon/vp9_convolve8_neon.asm.s
-vp9/common/arm/neon/vp9_convolve_neon.c
-vp9/common/arm/neon/vp9_copy_neon.asm.s
-vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct16x16_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct16x16_neon.c
-vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct32x32_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct4x4_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm.s
-vp9/common/arm/neon/vp9_idct8x8_add_neon.asm.s
-vp9/common/arm/neon/vp9_iht4x4_add_neon.asm.s
-vp9/common/arm/neon/vp9_iht8x8_add_neon.asm.s
-vp9/common/arm/neon/vp9_loopfilter_16_neon.asm.s
-vp9/common/arm/neon/vp9_loopfilter_16_neon.c
-vp9/common/arm/neon/vp9_loopfilter_neon.asm.s
-vp9/common/arm/neon/vp9_mb_lpf_neon.asm.s
-vp9/common/arm/neon/vp9_reconintra_neon.asm.s
-vp9/common/arm/neon/vp9_save_reg_neon.asm.s
+vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+vp9/common/arm/neon/vp9_iht8x8_add_neon.c
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -236,7 +187,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -245,8 +195,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -261,10 +209,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -279,21 +226,17 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
+vp9/encoder/arm/neon/vp9_avg_neon.c
 vp9/encoder/arm/neon/vp9_dct_neon.c
+vp9/encoder/arm/neon/vp9_error_neon.c
 vp9/encoder/arm/neon/vp9_quantize_neon.c
-vp9/encoder/arm/neon/vp9_sad_neon.c
-vp9/encoder/arm/neon/vp9_subtract_neon.c
-vp9/encoder/arm/neon/vp9_variance_neon.c
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -310,6 +253,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -334,9 +279,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -349,53 +295,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/arm_cpudetect.c
-vpx_ports/arm.h
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -403,3 +318,98 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/arm/bilinear_filter_media.asm
+vpx_dsp/arm/fwd_txfm_neon.c
+vpx_dsp/arm/idct16x16_1_add_neon.asm
+vpx_dsp/arm/idct16x16_add_neon.asm
+vpx_dsp/arm/idct16x16_neon.c
+vpx_dsp/arm/idct32x32_1_add_neon.asm
+vpx_dsp/arm/idct32x32_add_neon.asm
+vpx_dsp/arm/idct4x4_1_add_neon.asm
+vpx_dsp/arm/idct4x4_add_neon.asm
+vpx_dsp/arm/idct8x8_1_add_neon.asm
+vpx_dsp/arm/idct8x8_add_neon.asm
+vpx_dsp/arm/intrapred_neon.c
+vpx_dsp/arm/intrapred_neon_asm.asm
+vpx_dsp/arm/loopfilter_16_neon.asm
+vpx_dsp/arm/loopfilter_4_neon.asm
+vpx_dsp/arm/loopfilter_8_neon.asm
+vpx_dsp/arm/loopfilter_mb_neon.asm
+vpx_dsp/arm/loopfilter_neon.c
+vpx_dsp/arm/sad4d_neon.c
+vpx_dsp/arm/sad_media.asm
+vpx_dsp/arm/sad_neon.c
+vpx_dsp/arm/save_reg_neon.asm
+vpx_dsp/arm/subpel_variance_media.c
+vpx_dsp/arm/subpel_variance_neon.c
+vpx_dsp/arm/subtract_neon.c
+vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
+vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
+vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
+vpx_dsp/arm/variance_media.asm
+vpx_dsp/arm/variance_neon.c
+vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
+vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
+vpx_dsp/arm/vpx_convolve_neon.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/arm.h
+vpx_ports/arm_cpudetect.c
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
new file mode 100644
index 0000000..0b836c4
--- /dev/null
+++ b/config/arm-neon/vp8_rtcd.h
@@ -0,0 +1,242 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_neon
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_v6(struct blockd*, short *dqc);
+void vp8_dequantize_b_neon(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_neon
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_v6(short *input, short *output);
+void vp8_short_inv_walsh4x4_neon(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_armv6
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
new file mode 100644
index 0000000..e5134ff
--- /dev/null
+++ b/config/arm-neon/vp9_rtcd.h
@@ -0,0 +1,135 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_neon(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_neon
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_neon
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_neon
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_neon
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+void vp9_int_pro_row_neon(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_neon
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_neon
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_neon
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
new file mode 100644
index 0000000..6f03266
--- /dev/null
+++ b/config/arm-neon/vpx_config.asm
@@ -0,0 +1,89 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+	.equ DO1STROUNDING, 0
+.equ ARCH_ARM ,  1
+.equ ARCH_MIPS ,  0
+.equ ARCH_X86 ,  0
+.equ ARCH_X86_64 ,  0
+.equ HAVE_EDSP ,  0
+.equ HAVE_MEDIA ,  1
+.equ HAVE_NEON ,  1
+.equ HAVE_NEON_ASM ,  1
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_STDINT_H ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ HAVE_SYS_MMAN_H ,  1
+.equ HAVE_UNISTD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_USE_X86INC ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP10_ENCODER ,  0
+.equ CONFIG_VP10_DECODER ,  0
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_VP10 ,  0
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  1
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  0
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  0
+.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+	.section	.note.GNU-stack,"",%progbits
diff --git a/generic/vpx_config.c b/config/arm-neon/vpx_config.c
similarity index 72%
copy from generic/vpx_config.c
copy to config/arm-neon/vpx_config.c
index 61f4fd8..48dd606 100644
--- a/generic/vpx_config.c
+++ b/config/arm-neon/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/armv7a-neon/vpx_config.h b/config/arm-neon/vpx_config.h
similarity index 86%
rename from armv7a-neon/vpx_config.h
rename to config/arm-neon/vpx_config.h
index 2b09947..8d02c25 100644
--- a/armv7a-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 1
 #define ARCH_MIPS 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
-#define HAVE_EDSP 1
+#define HAVE_EDSP 0
 #define HAVE_MEDIA 1
 #define HAVE_NEON 1
 #define HAVE_NEON_ASM 1
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,19 +30,18 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
@@ -53,10 +52,6 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..4de075d
--- /dev/null
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -0,0 +1,765 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_neon
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_neon
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_neon
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_neon
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_neon
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_neon
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_neon
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_neon
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_neon
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_media(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_neon
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_media(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_neon
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_neon
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_neon
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_neon
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_neon
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_neon
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_neon
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_neon
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_neon
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_neon
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_neon
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_neon
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_neon
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_neon
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_neon
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_neon
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_neon
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_neon
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_neon
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_h_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_media
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_hv_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_media
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_v_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_media
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
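
Note on how these generated rtcd headers dispatch: because this configuration is built
with CONFIG_RUNTIME_CPU_DETECT set to 0, every vpx_* entry point above is bound at
compile time by a plain #define to the preferred variant (_neon, _media, or _c), and
setup_rtcd_internal() is an empty stub. A minimal caller sketch, assuming the generated
header is on the include path (the helper name and buffer layout are illustrative only):

  #include "vpx_dsp_rtcd.h"   /* vpx_sad16x16 expands to vpx_sad16x16_neon here */
  #include <stdint.h>

  /* Sum of absolute differences between two tightly packed 16x16 blocks. */
  static unsigned int block_sad_16x16(const uint8_t *src, const uint8_t *ref) {
      return vpx_sad16x16(src, 16 /* src_stride */, ref, 16 /* ref_stride */);
  }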
diff --git a/armv7a-neon/vpx_scale_rtcd.h b/config/arm-neon/vpx_scale_rtcd.h
similarity index 90%
rename from armv7a-neon/vpx_scale_rtcd.h
rename to config/arm-neon/vpx_scale_rtcd.h
index 0a6d790..a1564b7 100644
--- a/armv7a-neon/vpx_scale_rtcd.h
+++ b/config/arm-neon/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/arm-neon/vpx_version.h
similarity index 60%
rename from x86/vpx_version.h
rename to config/arm-neon/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/armv7a/libvpx_srcs.txt b/config/arm/libvpx_srcs.txt
similarity index 76%
rename from armv7a/libvpx_srcs.txt
rename to config/arm/libvpx_srcs.txt
index 0165e7a..53c4fda 100644
--- a/armv7a/libvpx_srcs.txt
+++ b/config/arm/libvpx_srcs.txt
@@ -1,40 +1,34 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
-vp8/common/arm/armv6/bilinearfilter_v6.asm.s
-vp8/common/arm/armv6/copymem16x16_v6.asm.s
-vp8/common/arm/armv6/copymem8x4_v6.asm.s
-vp8/common/arm/armv6/copymem8x8_v6.asm.s
-vp8/common/arm/armv6/dc_only_idct_add_v6.asm.s
-vp8/common/arm/armv6/dequant_idct_v6.asm.s
-vp8/common/arm/armv6/dequantize_v6.asm.s
-vp8/common/arm/armv6/filter_v6.asm.s
+vp8/common/arm/armv6/bilinearfilter_v6.asm
+vp8/common/arm/armv6/copymem16x16_v6.asm
+vp8/common/arm/armv6/copymem8x4_v6.asm
+vp8/common/arm/armv6/copymem8x8_v6.asm
+vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+vp8/common/arm/armv6/dequant_idct_v6.asm
+vp8/common/arm/armv6/dequantize_v6.asm
+vp8/common/arm/armv6/filter_v6.asm
 vp8/common/arm/armv6/idct_blk_v6.c
-vp8/common/arm/armv6/idct_v6.asm.s
-vp8/common/arm/armv6/intra4x4_predict_v6.asm.s
-vp8/common/arm/armv6/iwalsh_v6.asm.s
-vp8/common/arm/armv6/loopfilter_v6.asm.s
-vp8/common/arm/armv6/simpleloopfilter_v6.asm.s
-vp8/common/arm/armv6/sixtappredict8x4_v6.asm.s
-vp8/common/arm/armv6/vp8_sad16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance8x8_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm.s
+vp8/common/arm/armv6/idct_v6.asm
+vp8/common/arm/armv6/intra4x4_predict_v6.asm
+vp8/common/arm/armv6/iwalsh_v6.asm
+vp8/common/arm/armv6/loopfilter_v6.asm
+vp8/common/arm/armv6/simpleloopfilter_v6.asm
+vp8/common/arm/armv6/sixtappredict8x4_v6.asm
 vp8/common/arm/bilinearfilter_arm.c
 vp8/common/arm/bilinearfilter_arm.h
 vp8/common/arm/dequantize_arm.c
 vp8/common/arm/filter_arm.c
 vp8/common/arm/loopfilter_arm.c
-vp8/common/arm/variance_arm.c
 vp8/common/blockd.c
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -55,27 +49,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -84,9 +76,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -99,21 +90,13 @@
 vp8/decoder/onyxd_int.h
 vp8/decoder/threading.c
 vp8/decoder/treereader.h
-vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm.s
-vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_subtract_armv6.asm.s
-vp8/encoder/arm/armv6/walsh_v6.asm.s
-vp8/encoder/arm/boolhuff_arm.c
+vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
+vp8/encoder/arm/armv6/walsh_v6.asm
 vp8/encoder/arm/dct_arm.c
-vp8/encoder/arm/quantize_arm.c
 vp8/encoder/bitstream.c
 vp8/encoder/bitstream.h
 vp8/encoder/block.h
+vp8/encoder/boolhuff.c
 vp8/encoder/boolhuff.h
 vp8/encoder/dct.c
 vp8/encoder/dct_value_cost.h
@@ -142,7 +125,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -154,22 +136,20 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
-vp8/vp8cx_arm.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
+vp8/vp8cx_arm.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -185,7 +165,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -194,8 +173,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -210,10 +187,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -228,16 +204,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -254,6 +227,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -278,9 +253,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -293,53 +269,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/arm_cpudetect.c
-vpx_ports/arm.h
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -347,3 +292,70 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/arm/bilinear_filter_media.asm
+vpx_dsp/arm/sad_media.asm
+vpx_dsp/arm/subpel_variance_media.c
+vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
+vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
+vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
+vpx_dsp/arm/variance_media.asm
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/arm.h
+vpx_ports/arm_cpudetect.c
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/arm/vp8_rtcd.h b/config/arm/vp8_rtcd.h
new file mode 100644
index 0000000..7c2cefd
--- /dev/null
+++ b/config/arm/vp8_rtcd.h
@@ -0,0 +1,210 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_armv6
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_armv6
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_armv6
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_armv6
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_v6
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_v6
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_v6
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_v6
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_v6(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_v6
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_armv6
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_armv6
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_armv6
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_armv6
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_armv6
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_armv6
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_armv6
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_armv6
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_armv6
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_armv6
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_v6_dual
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_v6(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_v6
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_armv6
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_armv6
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_armv6
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_armv6
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_armv6
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm/vp9_rtcd.h b/config/arm/vp9_rtcd.h
new file mode 100644
index 0000000..360f538
--- /dev/null
+++ b/config/arm/vp9_rtcd.h
@@ -0,0 +1,126 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_c
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_c
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm/vpx_config.asm b/config/arm/vpx_config.asm
new file mode 100644
index 0000000..2a69621
--- /dev/null
+++ b/config/arm/vpx_config.asm
@@ -0,0 +1,89 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+	.equ DO1STROUNDING, 0
+.equ ARCH_ARM ,  1
+.equ ARCH_MIPS ,  0
+.equ ARCH_X86 ,  0
+.equ ARCH_X86_64 ,  0
+.equ HAVE_EDSP ,  0
+.equ HAVE_MEDIA ,  1
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_STDINT_H ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ HAVE_SYS_MMAN_H ,  1
+.equ HAVE_UNISTD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_USE_X86INC ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP10_ENCODER ,  0
+.equ CONFIG_VP10_DECODER ,  0
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_VP10 ,  0
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  1
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  0
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  0
+.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+	.section	.note.GNU-stack,"",%progbits
diff --git a/generic/vpx_config.c b/config/arm/vpx_config.c
similarity index 72%
copy from generic/vpx_config.c
copy to config/arm/vpx_config.c
index 61f4fd8..a45c0d2 100644
--- a/generic/vpx_config.c
+++ b/config/arm/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=armv6-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/armv7a/vpx_config.h b/config/arm/vpx_config.h
similarity index 86%
rename from armv7a/vpx_config.h
rename to config/arm/vpx_config.h
index fb8b725..62b6285 100644
--- a/armv7a/vpx_config.h
+++ b/config/arm/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 1
 #define ARCH_MIPS 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
-#define HAVE_EDSP 1
+#define HAVE_EDSP 0
 #define HAVE_MEDIA 1
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,19 +30,18 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
@@ -53,10 +52,6 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm/vpx_dsp_rtcd.h b/config/arm/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..bb570a0
--- /dev/null
+++ b/config/arm/vpx_dsp_rtcd.h
@@ -0,0 +1,674 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_media(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_media
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_media(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_media
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_media
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_media
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_media
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_media(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_media
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_h_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_media
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_hv_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_media
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_v_media(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_media
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/armv7a-neon/vpx_scale_rtcd.h b/config/arm/vpx_scale_rtcd.h
similarity index 90%
copy from armv7a-neon/vpx_scale_rtcd.h
copy to config/arm/vpx_scale_rtcd.h
index 0a6d790..a1564b7 100644
--- a/armv7a-neon/vpx_scale_rtcd.h
+++ b/config/arm/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/arm/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/arm/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/arm/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/armv7a/libvpx_srcs.txt b/config/arm64/libvpx_srcs.txt
similarity index 70%
copy from armv7a/libvpx_srcs.txt
copy to config/arm64/libvpx_srcs.txt
index 0165e7a..483ffbb 100644
--- a/armv7a/libvpx_srcs.txt
+++ b/config/arm64/libvpx_srcs.txt
@@ -1,40 +1,33 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
-vp8/common/arm/armv6/bilinearfilter_v6.asm.s
-vp8/common/arm/armv6/copymem16x16_v6.asm.s
-vp8/common/arm/armv6/copymem8x4_v6.asm.s
-vp8/common/arm/armv6/copymem8x8_v6.asm.s
-vp8/common/arm/armv6/dc_only_idct_add_v6.asm.s
-vp8/common/arm/armv6/dequant_idct_v6.asm.s
-vp8/common/arm/armv6/dequantize_v6.asm.s
-vp8/common/arm/armv6/filter_v6.asm.s
-vp8/common/arm/armv6/idct_blk_v6.c
-vp8/common/arm/armv6/idct_v6.asm.s
-vp8/common/arm/armv6/intra4x4_predict_v6.asm.s
-vp8/common/arm/armv6/iwalsh_v6.asm.s
-vp8/common/arm/armv6/loopfilter_v6.asm.s
-vp8/common/arm/armv6/simpleloopfilter_v6.asm.s
-vp8/common/arm/armv6/sixtappredict8x4_v6.asm.s
-vp8/common/arm/armv6/vp8_sad16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance16x16_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance8x8_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm.s
-vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm.s
-vp8/common/arm/bilinearfilter_arm.c
-vp8/common/arm/bilinearfilter_arm.h
 vp8/common/arm/dequantize_arm.c
 vp8/common/arm/filter_arm.c
 vp8/common/arm/loopfilter_arm.c
-vp8/common/arm/variance_arm.c
+vp8/common/arm/neon/bilinearpredict_neon.c
+vp8/common/arm/neon/copymem_neon.c
+vp8/common/arm/neon/dc_only_idct_add_neon.c
+vp8/common/arm/neon/dequant_idct_neon.c
+vp8/common/arm/neon/dequantizeb_neon.c
+vp8/common/arm/neon/idct_blk_neon.c
+vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+vp8/common/arm/neon/idct_dequant_full_2x_neon.c
+vp8/common/arm/neon/iwalsh_neon.c
+vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+vp8/common/arm/neon/mbloopfilter_neon.c
+vp8/common/arm/neon/reconintra_neon.c
+vp8/common/arm/neon/shortidct4x4llm_neon.c
+vp8/common/arm/neon/sixtappredict_neon.c
+vp8/common/arm/neon/vp8_loopfilter_neon.c
 vp8/common/blockd.c
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -55,27 +48,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -84,9 +75,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -99,21 +89,15 @@
 vp8/decoder/onyxd_int.h
 vp8/decoder/threading.c
 vp8/decoder/treereader.h
-vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s
-vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm.s
-vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm.s
-vp8/encoder/arm/armv6/vp8_subtract_armv6.asm.s
-vp8/encoder/arm/armv6/walsh_v6.asm.s
-vp8/encoder/arm/boolhuff_arm.c
 vp8/encoder/arm/dct_arm.c
-vp8/encoder/arm/quantize_arm.c
+vp8/encoder/arm/neon/denoising_neon.c
+vp8/encoder/arm/neon/fastquantizeb_neon.c
+vp8/encoder/arm/neon/shortfdct_neon.c
+vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
 vp8/encoder/bitstream.c
 vp8/encoder/bitstream.h
 vp8/encoder/block.h
+vp8/encoder/boolhuff.c
 vp8/encoder/boolhuff.h
 vp8/encoder/dct.c
 vp8/encoder/dct_value_cost.h
@@ -142,7 +126,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -154,22 +137,22 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
-vp8/vp8cx_arm.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
+vp8/vp8cx_arm.mk
 vp8/vp8dx.mk
+vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+vp9/common/arm/neon/vp9_iht8x8_add_neon.c
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -185,7 +168,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -194,8 +176,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -210,10 +190,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -228,16 +207,17 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
+vp9/encoder/arm/neon/vp9_avg_neon.c
+vp9/encoder/arm/neon/vp9_dct_neon.c
+vp9/encoder/arm/neon/vp9_error_neon.c
+vp9/encoder/arm/neon/vp9_quantize_neon.c
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -254,6 +234,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -278,9 +260,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -293,53 +276,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/arm_cpudetect.c
-vpx_ports/arm.h
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -347,3 +299,88 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/arm/fwd_txfm_neon.c
+vpx_dsp/arm/idct16x16_1_add_neon.c
+vpx_dsp/arm/idct16x16_add_neon.c
+vpx_dsp/arm/idct16x16_neon.c
+vpx_dsp/arm/idct32x32_1_add_neon.c
+vpx_dsp/arm/idct32x32_add_neon.c
+vpx_dsp/arm/idct4x4_1_add_neon.c
+vpx_dsp/arm/idct4x4_add_neon.c
+vpx_dsp/arm/idct8x8_1_add_neon.c
+vpx_dsp/arm/idct8x8_add_neon.c
+vpx_dsp/arm/intrapred_neon.c
+vpx_dsp/arm/loopfilter_16_neon.c
+vpx_dsp/arm/loopfilter_4_neon.c
+vpx_dsp/arm/loopfilter_8_neon.c
+vpx_dsp/arm/loopfilter_neon.c
+vpx_dsp/arm/sad4d_neon.c
+vpx_dsp/arm/sad_neon.c
+vpx_dsp/arm/subpel_variance_neon.c
+vpx_dsp/arm/subtract_neon.c
+vpx_dsp/arm/variance_neon.c
+vpx_dsp/arm/vpx_convolve8_avg_neon.c
+vpx_dsp/arm/vpx_convolve8_neon.c
+vpx_dsp/arm/vpx_convolve_avg_neon.c
+vpx_dsp/arm/vpx_convolve_copy_neon.c
+vpx_dsp/arm/vpx_convolve_neon.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/arm.h
+vpx_ports/arm_cpudetect.c
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h
new file mode 100644
index 0000000..1f37629
--- /dev/null
+++ b/config/arm64/vp8_rtcd.h
@@ -0,0 +1,212 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_neon
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_neon
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_neon(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_neon
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_neon(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h
new file mode 100644
index 0000000..e5134ff
--- /dev/null
+++ b/config/arm64/vp9_rtcd.h
@@ -0,0 +1,135 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_neon(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_neon
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_neon
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_neon
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_neon
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+void vp9_int_pro_row_neon(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_neon
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_neon
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_neon
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm
new file mode 100644
index 0000000..b6c1a52
--- /dev/null
+++ b/config/arm64/vpx_config.asm
@@ -0,0 +1,89 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+	.equ DO1STROUNDING, 0
+.equ ARCH_ARM ,  1
+.equ ARCH_MIPS ,  0
+.equ ARCH_X86 ,  0
+.equ ARCH_X86_64 ,  0
+.equ HAVE_EDSP ,  0
+.equ HAVE_MEDIA ,  0
+.equ HAVE_NEON ,  1
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_STDINT_H ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ HAVE_SYS_MMAN_H ,  1
+.equ HAVE_UNISTD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_USE_X86INC ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP10_ENCODER ,  0
+.equ CONFIG_VP10_DECODER ,  0
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_VP10 ,  0
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  1
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  0
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  0
+.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+	.section	.note.GNU-stack,"",%progbits
diff --git a/generic/vpx_config.c b/config/arm64/vpx_config.c
similarity index 72%
copy from generic/vpx_config.c
copy to config/arm64/vpx_config.c
index 61f4fd8..50e1a4b 100644
--- a/generic/vpx_config.c
+++ b/config/arm64/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/generic/vpx_config.h b/config/arm64/vpx_config.h
similarity index 86%
copy from generic/vpx_config.h
copy to config/arm64/vpx_config.h
index 743623f..fb0eabc 100644
--- a/generic/vpx_config.h
+++ b/config/arm64/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
-#define ARCH_ARM 0
+#define INLINE      inline
+#define ARCH_ARM 1
 #define ARCH_MIPS 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
-#define HAVE_NEON 0
+#define HAVE_NEON 1
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,33 +30,28 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
 #define CONFIG_RVCT 0
 #define CONFIG_GCC 1
 #define CONFIG_MSVS 0
-#define CONFIG_PIC 0
+#define CONFIG_PIC 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -89,10 +87,12 @@
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..2cac9e6
--- /dev/null
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -0,0 +1,750 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_neon
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_neon
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_neon
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_neon
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_neon
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_neon
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_neon
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_neon
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_neon
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_neon
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_neon
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_neon
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_neon
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_neon
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_neon
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_neon
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_neon
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_neon
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_neon
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_neon
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_neon
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_neon
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_neon
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_neon
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_neon
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_neon
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_neon
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_neon
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_neon
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_c
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_c
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
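
[Annotation, not part of the patch: the generated arm64 vpx_dsp_rtcd.h above resolves every dispatch symbol at compile time because CONFIG_RUNTIME_CPU_DETECT is 0 for this target; each vpx_* name is a plain #define to either the _c or _neon implementation, and setup_rtcd_internal() has nothing left to do beyond reading arm_cpu_caps(). A minimal sketch of what that means for a caller is below; the include path, buffers, and main() are assumptions for illustration only and do not appear in this patch.]

    /* Sketch: compile-time RTCD dispatch when runtime CPU detection is off.
     * "vpx_dsp_rtcd.h" here stands in for config/arm64/vpx_dsp_rtcd.h. */
    #include <stdint.h>
    #include <stdio.h>
    #include "vpx_dsp_rtcd.h"

    static uint8_t src[32 * 32];
    static uint8_t ref[32 * 32];

    int main(void) {
        /* vpx_sad32x32 is a #define, so this call binds directly to
         * vpx_sad32x32_neon with no function-pointer indirection. */
        unsigned int sad = vpx_sad32x32(src, 32, ref, 32);
        printf("SAD: %u\n", sad);
        return 0;
    }
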
diff --git a/armv7a-neon/vpx_scale_rtcd.h b/config/arm64/vpx_scale_rtcd.h
similarity index 90%
copy from armv7a-neon/vpx_scale_rtcd.h
copy to config/arm64/vpx_scale_rtcd.h
index 0a6d790..a1564b7 100644
--- a/armv7a-neon/vpx_scale_rtcd.h
+++ b/config/arm64/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/arm64/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/arm64/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
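
[Annotation, not part of the patch: with the bump to v1.4.0, VERSION_PACKED evaluates to (1<<16)|(4<<8)|0 = 0x010400 = 66560. The snippet below is a hedged, self-contained check of that arithmetic; the macro values are copied from the header above and main() is added only for illustration.]

    /* Verify the packed version number for v1.4.0. */
    #include <stdio.h>
    #define VERSION_MAJOR  1
    #define VERSION_MINOR  4
    #define VERSION_PATCH  0
    #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

    int main(void) {
        /* Prints "0x010400 (66560)". */
        printf("0x%06x (%d)\n", VERSION_PACKED, VERSION_PACKED);
        return 0;
    }
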
diff --git a/generic/libvpx_srcs.txt b/config/generic/libvpx_srcs.txt
similarity index 86%
rename from generic/libvpx_srcs.txt
rename to config/generic/libvpx_srcs.txt
index 0e70976..f6e76f0 100644
--- a/generic/libvpx_srcs.txt
+++ b/config/generic/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,27 +29,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -57,9 +56,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -104,7 +102,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -116,21 +113,19 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -146,7 +141,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -155,8 +149,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -171,10 +163,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -189,16 +180,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -215,6 +203,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -239,9 +229,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -254,51 +245,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -306,3 +268,61 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
new file mode 100644
index 0000000..f5424bb
--- /dev/null
+++ b/config/generic/vp8_rtcd.h
@@ -0,0 +1,175 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h
new file mode 100644
index 0000000..d3eb4c5
--- /dev/null
+++ b/config/generic/vp9_rtcd.h
@@ -0,0 +1,121 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_c
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_c
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
new file mode 100644
index 0000000..b684cd2
--- /dev/null
+++ b/config/generic/vpx_config.asm
@@ -0,0 +1,89 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+	.equ DO1STROUNDING, 0
+.equ ARCH_ARM ,  0
+.equ ARCH_MIPS ,  0
+.equ ARCH_X86 ,  0
+.equ ARCH_X86_64 ,  0
+.equ HAVE_EDSP ,  0
+.equ HAVE_MEDIA ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_STDINT_H ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ HAVE_SYS_MMAN_H ,  1
+.equ HAVE_UNISTD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  1
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_USE_X86INC ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP10_ENCODER ,  0
+.equ CONFIG_VP10_DECODER ,  0
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_VP10 ,  0
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  1
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  0
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  0
+.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+	.section	.note.GNU-stack,"",%progbits
diff --git a/generic/vpx_config.c b/config/generic/vpx_config.c
similarity index 73%
rename from generic/vpx_config.c
rename to config/generic/vpx_config.c
index 61f4fd8..269eb82 100644
--- a/generic/vpx_config.c
+++ b/config/generic/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/generic/vpx_config.h b/config/generic/vpx_config.h
similarity index 87%
rename from generic/vpx_config.h
rename to config/generic/vpx_config.h
index 743623f..9cdca1f 100644
--- a/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 0
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,33 +30,28 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
 #define CONFIG_RVCT 0
 #define CONFIG_GCC 1
 #define CONFIG_MSVS 0
-#define CONFIG_PIC 0
+#define CONFIG_PIC 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -89,10 +87,12 @@
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..010cbe7
--- /dev/null
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -0,0 +1,660 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_c
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_c
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
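The generic vpx_dsp_rtcd.h above binds every entry point to its plain-C implementation at preprocessing time ("#define vpx_foo vpx_foo_c"), which is why setup_rtcd_internal() is empty for this target: there is nothing to probe at run time. A minimal caller sketch, assuming the header is reachable on the include path; the buffer contents and the 4-pixel stride are illustrative only and not taken from this change:

  #include <stdint.h>
  #include "vpx_dsp_rtcd.h"  /* generic config: each name expands to its _c version */

  int main(void) {
    uint8_t dst[4 * 4];
    const uint8_t above[4] = {100, 100, 100, 100};
    const uint8_t left[4]  = {50, 50, 50, 50};
    /* No dispatch table is consulted here; the macro resolves the call to
       vpx_dc_predictor_4x4_c at compile time. */
    vpx_dc_predictor_4x4(dst, 4 /* y_stride */, above, left);
    return 0;
  }
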
diff --git a/generic/vpx_scale_rtcd.h b/config/generic/vpx_scale_rtcd.h
similarity index 90%
rename from generic/vpx_scale_rtcd.h
rename to config/generic/vpx_scale_rtcd.h
index f5e6caa..f419cc7 100644
--- a/generic/vpx_scale_rtcd.h
+++ b/config/generic/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/generic/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/generic/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
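The version bump is also visible in the packed form: VERSION_PACKED shifts major, minor, and patch into a single integer, so v1.4.0 packs to (1<<16)|(4<<8)|0 = 0x010400. A tiny sanity-check sketch, assuming the copied config/generic/vpx_version.h is on the include path:

  #include <assert.h>
  #include "vpx_version.h"

  int main(void) {
    /* 0x010400 == (1 << 16) | (4 << 8) | 0 for the v1.4.0 headers added here */
    assert(VERSION_PACKED == 0x010400);
    return 0;
  }
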
diff --git a/mips-dspr2/libvpx_srcs.txt b/config/mips32-dspr2/libvpx_srcs.txt
similarity index 77%
rename from mips-dspr2/libvpx_srcs.txt
rename to config/mips32-dspr2/libvpx_srcs.txt
index f65bc7b..9ea5ede 100644
--- a/mips-dspr2/libvpx_srcs.txt
+++ b/config/mips32-dspr2/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,33 +29,31 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/mips/dspr2/dequantize_dspr2.c
 vp8/common/mips/dspr2/filter_dspr2.c
 vp8/common/mips/dspr2/idct_blk_dspr2.c
 vp8/common/mips/dspr2/idctllm_dspr2.c
-vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
 vp8/common/mips/dspr2/reconinter_dspr2.c
+vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -63,9 +62,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -110,7 +108,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -122,47 +119,22 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
-vp9/common/mips/dspr2/vp9_common_dspr2.h
-vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
-vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
-vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
-vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
-vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
 vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
-vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
-vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
 vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
 vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
-vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
-vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
-vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
-vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
-vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
-vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
-vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -178,7 +150,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -187,8 +158,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -203,10 +172,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -221,16 +189,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -247,6 +212,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -271,9 +238,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -286,30 +254,100 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
+vpx/src/vpx_codec.c
+vpx/src/vpx_decoder.c
+vpx/src/vpx_encoder.c
+vpx/src/vpx_image.c
+vpx/src/vpx_psnr.c
+vpx/vp8.h
+vpx/vp8cx.h
+vpx/vp8dx.h
+vpx/vpx_codec.h
+vpx/vpx_codec.mk
+vpx/vpx_decoder.h
+vpx/vpx_encoder.h
+vpx/vpx_frame_buffer.h
+vpx/vpx_image.h
+vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/mips/common_dspr2.c
+vpx_dsp/mips/common_dspr2.h
+vpx_dsp/mips/convolve2_avg_dspr2.c
+vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+vpx_dsp/mips/convolve2_dspr2.c
+vpx_dsp/mips/convolve2_horiz_dspr2.c
+vpx_dsp/mips/convolve2_vert_dspr2.c
+vpx_dsp/mips/convolve8_avg_dspr2.c
+vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+vpx_dsp/mips/convolve8_dspr2.c
+vpx_dsp/mips/convolve8_horiz_dspr2.c
+vpx_dsp/mips/convolve8_vert_dspr2.c
+vpx_dsp/mips/convolve_common_dspr2.h
+vpx_dsp/mips/intrapred16_dspr2.c
+vpx_dsp/mips/intrapred4_dspr2.c
+vpx_dsp/mips/intrapred8_dspr2.c
+vpx_dsp/mips/inv_txfm_dspr2.h
+vpx_dsp/mips/itrans16_dspr2.c
+vpx_dsp/mips/itrans32_cols_dspr2.c
+vpx_dsp/mips/itrans32_dspr2.c
+vpx_dsp/mips/itrans4_dspr2.c
+vpx_dsp/mips/itrans8_dspr2.c
+vpx_dsp/mips/loopfilter_filters_dspr2.c
+vpx_dsp/mips/loopfilter_filters_dspr2.h
+vpx_dsp/mips/loopfilter_macros_dspr2.h
+vpx_dsp/mips/loopfilter_masks_dspr2.h
+vpx_dsp/mips/loopfilter_mb_dspr2.c
+vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
+vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
 vpx_mem/include/vpx_mem_intrnl.h
 vpx_mem/vpx_mem.c
 vpx_mem/vpx_mem.h
 vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
+vpx_ports/bitops.h
 vpx_ports/emmintrin_compat.h
 vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
 vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
 vpx_ports/vpx_once.h
 vpx_ports/vpx_ports.mk
 vpx_ports/vpx_timer.h
@@ -318,24 +356,12 @@
 vpx_scale/generic/yv12config.c
 vpx_scale/generic/yv12extend.c
 vpx_scale/mips/dspr2/yv12extend_dspr2.c
-vpx_scale/vpx_scale_asm_offsets.c
 vpx_scale/vpx_scale.h
 vpx_scale/vpx_scale.mk
 vpx_scale/vpx_scale_rtcd.c
 vpx_scale/vpx_scale_rtcd.pl
 vpx_scale/yv12config.h
-vpx/src/vpx_codec.c
-vpx/src/vpx_decoder.c
-vpx/src/vpx_encoder.c
-vpx/src/vpx_image.c
-vpx/src/vpx_psnr.c
-vpx/vp8cx.h
-vpx/vp8dx.h
-vpx/vp8.h
-vpx/vpx_codec.h
-vpx/vpx_codec.mk
-vpx/vpx_decoder.h
-vpx/vpx_encoder.h
-vpx/vpx_frame_buffer.h
-vpx/vpx_image.h
-vpx/vpx_integer.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/mips32-dspr2/vp8_rtcd.h b/config/mips32-dspr2/vp8_rtcd.h
new file mode 100644
index 0000000..4442f6a
--- /dev/null
+++ b/config/mips32-dspr2/vp8_rtcd.h
@@ -0,0 +1,204 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_dspr2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_dspr2
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_dspr2
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_dspr2(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_dspr2
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_dspr2(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_dspr2
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_dspr2
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_dspr2
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_dspr2
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_dspr2
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_dspr2
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_dspr2
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_dspr2
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_dspr2
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_dspr2
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_dspr2
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_dspr2
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_dspr2
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_dspr2
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
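Even on the mips32-dspr2 target the dispatch stays compile time: where a DSPR2 kernel exists the header declares both prototypes and points the public name straight at the _dspr2 variant (for example vp8_copy_mem16x16 resolves to vp8_copy_mem16x16_dspr2), so setup_rtcd_internal() only has the DSPR2 static-init hooks to run. A simplified sketch of the one translation unit that defines RTCD_C before including the header; the real consumer is vp8/common/rtcd.c, which additionally funnels the call through the once() helper in vpx_ports/vpx_once.h:

  #include "vpx_config.h"
  #define RTCD_C            /* makes RTCD_EXTERN empty and emits setup_rtcd_internal() */
  #include "vp8_rtcd.h"

  void vp8_rtcd(void) {
    /* On this config the body only invokes the DSPR2 init hooks declared above;
       no CPU feature probing takes place. */
    setup_rtcd_internal();
  }
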
diff --git a/config/mips32-dspr2/vp9_rtcd.h b/config/mips32-dspr2/vp9_rtcd.h
new file mode 100644
index 0000000..7462519
--- /dev/null
+++ b/config/mips32-dspr2/vp9_rtcd.h
@@ -0,0 +1,135 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_dspr2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_dspr2
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_dspr2
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_dspr2
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_c
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_c
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/generic/vpx_config.c b/config/mips32-dspr2/vpx_config.c
similarity index 71%
copy from generic/vpx_config.c
copy to config/mips32-dspr2/vpx_config.c
index 61f4fd8..b2f14a3 100644
--- a/generic/vpx_config.c
+++ b/config/mips32-dspr2/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips-dspr2/vpx_config.h b/config/mips32-dspr2/vpx_config.h
similarity index 86%
rename from mips-dspr2/vpx_config.h
rename to config/mips32-dspr2/vpx_config.h
index 543d179..f0a0556 100644
--- a/mips-dspr2/vpx_config.h
+++ b/config/mips32-dspr2/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 1
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 1
 #define HAVE_DSPR2 1
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,33 +30,28 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
 #define CONFIG_RVCT 0
 #define CONFIG_GCC 1
 #define CONFIG_MSVS 0
-#define CONFIG_PIC 0
+#define CONFIG_PIC 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 0
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 1
 #define CONFIG_DC_RECON 1
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips32-dspr2/vpx_dsp_rtcd.h b/config/mips32-dspr2/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..b716181
--- /dev/null
+++ b/config/mips32-dspr2/vpx_dsp_rtcd.h
@@ -0,0 +1,709 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_dspr2
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_dspr2
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_dspr2
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_dspr2
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_dspr2
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_dspr2
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_dspr2
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_dspr2
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_dspr2
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_dspr2
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_dspr2
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_dspr2
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_dspr2
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_dspr2
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_dspr2
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_dspr2
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_dspr2
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_dspr2
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_dspr2
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_dspr2
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_dspr2
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_dspr2
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_dspr2
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_dspr2
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_dspr2
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_dspr2
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_4_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_dspr2
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_dspr2
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_8_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_dspr2
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_dspr2
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_dspr2
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_dspr2
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_4_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_dspr2
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_dspr2
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_8_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_dspr2
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_dspr2
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_dspr2
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_dspr2
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_c
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_c
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/mips-dspr2/vpx_scale_rtcd.h b/config/mips32-dspr2/vpx_scale_rtcd.h
similarity index 84%
rename from mips-dspr2/vpx_scale_rtcd.h
rename to config/mips32-dspr2/vpx_scale_rtcd.h
index 6678742..15b1b5a 100644
--- a/mips-dspr2/vpx_scale_rtcd.h
+++ b/config/mips32-dspr2/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,13 +40,13 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-void vp9_extend_frame_borders_dspr2(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_dspr2
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+void vpx_extend_frame_borders_dspr2(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_dspr2
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-void vp9_extend_frame_inner_borders_dspr2(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_dspr2
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+void vpx_extend_frame_inner_borders_dspr2(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_dspr2
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
@@ -59,13 +59,14 @@
 static void setup_rtcd_internal(void)
 {
 #if HAVE_DSPR2
+void vpx_dsputil_static_init();
 #if CONFIG_VP8
 void dsputil_static_init();
-dsputil_static_init();
 #endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
 #endif
 #endif
 }
diff --git a/x86/vpx_version.h b/config/mips32-dspr2/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/mips32-dspr2/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/mips32-dspr2/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/generic/libvpx_srcs.txt b/config/mips32/libvpx_srcs.txt
similarity index 86%
copy from generic/libvpx_srcs.txt
copy to config/mips32/libvpx_srcs.txt
index 0e70976..f6e76f0 100644
--- a/generic/libvpx_srcs.txt
+++ b/config/mips32/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,27 +29,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -57,9 +56,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -104,7 +102,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -116,21 +113,19 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -146,7 +141,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -155,8 +149,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -171,10 +163,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -189,16 +180,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -215,6 +203,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -239,9 +229,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -254,51 +245,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -306,3 +268,61 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/mips32/vp8_rtcd.h b/config/mips32/vp8_rtcd.h
new file mode 100644
index 0000000..28e23b3
--- /dev/null
+++ b/config/mips32/vp8_rtcd.h
@@ -0,0 +1,186 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/mips32/vp9_rtcd.h b/config/mips32/vp9_rtcd.h
new file mode 100644
index 0000000..28d653f
--- /dev/null
+++ b/config/mips32/vp9_rtcd.h
@@ -0,0 +1,132 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_c
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_c
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/generic/vpx_config.c b/config/mips32/vpx_config.c
similarity index 71%
copy from generic/vpx_config.c
copy to config/mips32/vpx_config.c
index 61f4fd8..81edcc9 100644
--- a/generic/vpx_config.c
+++ b/config/mips32/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips/vpx_config.h b/config/mips32/vpx_config.h
similarity index 86%
rename from mips/vpx_config.h
rename to config/mips32/vpx_config.h
index 79594f8..1bc7afa 100644
--- a/mips/vpx_config.h
+++ b/config/mips32/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 1
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 1
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,33 +30,28 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
 #define CONFIG_RVCT 0
 #define CONFIG_GCC 1
 #define CONFIG_MSVS 0
-#define CONFIG_PIC 0
+#define CONFIG_PIC 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 1
 #define CONFIG_DC_RECON 1
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips32/vpx_dsp_rtcd.h b/config/mips32/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..cff36af
--- /dev/null
+++ b/config/mips32/vpx_dsp_rtcd.h
@@ -0,0 +1,671 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_c
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_c
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/mips/vpx_scale_rtcd.h b/config/mips32/vpx_scale_rtcd.h
similarity index 87%
rename from mips/vpx_scale_rtcd.h
rename to config/mips32/vpx_scale_rtcd.h
index d26e413..ea70efc 100644
--- a/mips/vpx_scale_rtcd.h
+++ b/config/mips32/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
@@ -57,13 +57,14 @@
 static void setup_rtcd_internal(void)
 {
 #if HAVE_DSPR2
+void vpx_dsputil_static_init();
 #if CONFIG_VP8
 void dsputil_static_init();
-dsputil_static_init();
 #endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
 #endif
 #endif
 }
diff --git a/x86/vpx_version.h b/config/mips32/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/mips32/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/mips32/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/generic/libvpx_srcs.txt b/config/mips64/libvpx_srcs.txt
similarity index 86%
copy from generic/libvpx_srcs.txt
copy to config/mips64/libvpx_srcs.txt
index 0e70976..f6e76f0 100644
--- a/generic/libvpx_srcs.txt
+++ b/config/mips64/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,27 +29,25 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/ppflags.h
 vp8/common/quant_common.c
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -57,9 +56,8 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -104,7 +102,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -116,21 +113,19 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -146,7 +141,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -155,8 +149,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -171,10 +163,9 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
 vp9/decoder/vp9_decodeframe.c
@@ -189,16 +180,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -215,6 +203,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -239,9 +229,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -254,51 +245,22 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -306,3 +268,61 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/mips64/vp8_rtcd.h b/config/mips64/vp8_rtcd.h
new file mode 100644
index 0000000..28e23b3
--- /dev/null
+++ b/config/mips64/vp8_rtcd.h
@@ -0,0 +1,186 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
+void vp8_clear_system_state_c();
+#define vp8_clear_system_state vp8_clear_system_state_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/mips64/vp9_rtcd.h b/config/mips64/vp9_rtcd.h
new file mode 100644
index 0000000..28d653f
--- /dev/null
+++ b/config/mips64/vp9_rtcd.h
@@ -0,0 +1,132 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_c
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_c
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_c
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/generic/vpx_config.c b/config/mips64/vpx_config.c
similarity index 72%
copy from generic/vpx_config.c
copy to config/mips64/vpx_config.c
index 61f4fd8..3b3bb49 100644
--- a/generic/vpx_config.c
+++ b/config/mips64/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=mips64-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips/vpx_config.h b/config/mips64/vpx_config.h
similarity index 85%
copy from mips/vpx_config.h
copy to config/mips64/vpx_config.h
index 79594f8..f19731b 100644
--- a/mips/vpx_config.h
+++ b/config/mips64/vpx_config.h
@@ -9,19 +9,19 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 1
 #define ARCH_X86 0
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
-#define HAVE_MIPS32 1
+#define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 1
 #define HAVE_MMX 0
 #define HAVE_SSE 0
 #define HAVE_SSE2 0
@@ -30,33 +30,28 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
 #define CONFIG_INSTALL_SRCS 0
-#define CONFIG_USE_X86INC 1
+#define CONFIG_USE_X86INC 0
 #define CONFIG_DEBUG 0
 #define CONFIG_GPROF 0
 #define CONFIG_GCOV 0
 #define CONFIG_RVCT 0
 #define CONFIG_GCC 1
 #define CONFIG_MSVS 0
-#define CONFIG_PIC 0
+#define CONFIG_PIC 1
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 1
 #define CONFIG_DC_RECON 1
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/mips64/vpx_dsp_rtcd.h b/config/mips64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..cff36af
--- /dev/null
+++ b/config/mips64/vpx_dsp_rtcd.h
@@ -0,0 +1,671 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_c
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_c
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
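Usage note: a generated *_rtcd.h header such as the one above is consumed by defining RTCD_C in exactly one translation unit (vpx_dsp/vpx_dsp_rtcd.c in the source lists) and calling the public entry point once at startup. A minimal sketch, assuming the once() helper from vpx_ports/vpx_once.h; the real vpx_dsp_rtcd.c may differ in detail:

    /* vpx_dsp_rtcd.c (sketch, not the upstream file verbatim) */
    #include "./vpx_config.h"
    #define RTCD_C                 /* exposes setup_rtcd_internal() in the header below */
    #include "vpx_dsp_rtcd.h"
    #include "vpx_ports/vpx_once.h"

    void vpx_dsp_rtcd(void)
    {
        /* Run the per-platform setup at most once, no matter how many codecs call this. */
        once(setup_rtcd_internal);
    }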
diff --git a/mips/vpx_scale_rtcd.h b/config/mips64/vpx_scale_rtcd.h
similarity index 87%
copy from mips/vpx_scale_rtcd.h
copy to config/mips64/vpx_scale_rtcd.h
index d26e413..ea70efc 100644
--- a/mips/vpx_scale_rtcd.h
+++ b/config/mips64/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
@@ -57,13 +57,14 @@
 static void setup_rtcd_internal(void)
 {
 #if HAVE_DSPR2
+void vpx_dsputil_static_init();
 #if CONFIG_VP8
 void dsputil_static_init();
-dsputil_static_init();
 #endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
 #endif
 #endif
 }
diff --git a/x86/vpx_version.h b/config/mips64/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/mips64/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/mips64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
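For reference, with the formula above VERSION_PACKED for the new v1.4.0 header works out to (1<<16)|(4<<8)|0 = 0x10400 (66560), up from 0x10300 (66304) for the v1.3.0 header it replaces.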
diff --git a/x86/libvpx_srcs.txt b/config/x86/libvpx_srcs.txt
similarity index 77%
rename from x86/libvpx_srcs.txt
rename to config/x86/libvpx_srcs.txt
index 91a8b84..af63fd8 100644
--- a/x86/libvpx_srcs.txt
+++ b/config/x86/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,17 +29,16 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/mfqe.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/postproc.c
 vp8/common/postproc.h
 vp8/common/ppflags.h
@@ -46,12 +46,11 @@
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -60,9 +59,10 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
+vp8/common/x86/copy_sse2.asm
+vp8/common/x86/copy_sse3.asm
 vp8/common/x86/dequantize_mmx.asm
 vp8/common/x86/filter_x86.c
 vp8/common/x86/filter_x86.h
@@ -72,7 +72,6 @@
 vp8/common/x86/idctllm_sse2.asm
 vp8/common/x86/iwalsh_mmx.asm
 vp8/common/x86/iwalsh_sse2.asm
-vp8/common/x86/loopfilter_mmx.asm
 vp8/common/x86/loopfilter_sse2.asm
 vp8/common/x86/loopfilter_x86.c
 vp8/common/x86/mfqe_sse2.asm
@@ -81,15 +80,11 @@
 vp8/common/x86/recon_mmx.asm
 vp8/common/x86/recon_sse2.asm
 vp8/common/x86/recon_wrapper_sse2.c
-vp8/common/x86/sad_mmx.asm
-vp8/common/x86/sad_sse2.asm
 vp8/common/x86/subpixel_mmx.asm
 vp8/common/x86/subpixel_sse2.asm
-vp8/common/x86/variance_impl_mmx.asm
-vp8/common/x86/variance_impl_sse2.asm
-vp8/common/x86/variance_mmx.c
-vp8/common/x86/variance_sse2.c
+vp8/common/x86/subpixel_ssse3.asm
 vp8/common/x86/vp8_asm_stubs.c
+vp8/common/x86/vp8_loopfilter_mmx.asm
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -134,7 +129,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -146,32 +140,29 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/encoder/x86/dct_mmx.asm
 vp8/encoder/x86/dct_sse2.asm
 vp8/encoder/x86/denoising_sse2.c
 vp8/encoder/x86/encodeopt.asm
 vp8/encoder/x86/fwalsh_sse2.asm
 vp8/encoder/x86/quantize_mmx.asm
-vp8/encoder/x86/quantize_sse2.c
-vp8/encoder/x86/subtract_mmx.asm
-vp8/encoder/x86/subtract_sse2.asm
+vp8/encoder/x86/quantize_ssse3.c
 vp8/encoder/x86/vp8_enc_stubs_mmx.c
 vp8/encoder/x86/vp8_enc_stubs_sse2.c
+vp8/encoder/x86/vp8_quantize_sse2.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -187,7 +178,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -196,8 +186,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -212,21 +200,12 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
-vp9/common/x86/vp9_asm_stubs.c
-vp9/common/x86/vp9_copy_sse2.asm
 vp9/common/x86/vp9_idct_intrin_sse2.c
-vp9/common/x86/vp9_idct_intrin_sse2.h
-vp9/common/x86/vp9_intrapred_sse2.asm
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c
-vp9/common/x86/vp9_loopfilter_mmx.asm
-vp9/common/x86/vp9_subpixel_8t_sse2.asm
-vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
 vp9/decoder/vp9_decodeframe.c
 vp9/decoder/vp9_decodeframe.h
 vp9/decoder/vp9_decodemv.c
@@ -239,16 +218,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -265,6 +241,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -289,9 +267,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -304,68 +283,29 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
-vp9/encoder/x86/vp9_dct32x32_sse2.c
+vp9/encoder/x86/vp9_avg_intrin_sse2.c
 vp9/encoder/x86/vp9_dct_mmx.asm
 vp9/encoder/x86/vp9_dct_sse2.c
+vp9/encoder/x86/vp9_dct_ssse3.c
 vp9/encoder/x86/vp9_error_sse2.asm
-vp9/encoder/x86/vp9_sad4d_sse2.asm
-vp9/encoder/x86/vp9_sad_mmx.asm
-vp9/encoder/x86/vp9_sad_sse2.asm
-vp9/encoder/x86/vp9_subpel_variance.asm
-vp9/encoder/x86/vp9_subtract_sse2.asm
+vp9/encoder/x86/vp9_quantize_sse2.c
 vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
-vp9/encoder/x86/vp9_variance_impl_mmx.asm
-vp9/encoder/x86/vp9_variance_impl_sse2.asm
-vp9/encoder/x86/vp9_variance_mmx.c
-vp9/encoder/x86/vp9_variance_sse2.c
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/emms.asm
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_ports/x86_abi_support.asm
-vpx_ports/x86.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -373,3 +313,95 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_dsp/x86/convolve.h
+vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+vpx_dsp/x86/fwd_txfm_impl_sse2.h
+vpx_dsp/x86/fwd_txfm_sse2.c
+vpx_dsp/x86/fwd_txfm_sse2.h
+vpx_dsp/x86/intrapred_sse2.asm
+vpx_dsp/x86/intrapred_ssse3.asm
+vpx_dsp/x86/inv_txfm_sse2.c
+vpx_dsp/x86/inv_txfm_sse2.h
+vpx_dsp/x86/inv_wht_sse2.asm
+vpx_dsp/x86/loopfilter_mmx.asm
+vpx_dsp/x86/loopfilter_sse2.c
+vpx_dsp/x86/quantize_sse2.c
+vpx_dsp/x86/sad4d_sse2.asm
+vpx_dsp/x86/sad_mmx.asm
+vpx_dsp/x86/sad_sse2.asm
+vpx_dsp/x86/sad_sse3.asm
+vpx_dsp/x86/sad_ssse3.asm
+vpx_dsp/x86/subpel_variance_sse2.asm
+vpx_dsp/x86/subtract_sse2.asm
+vpx_dsp/x86/txfm_common_sse2.h
+vpx_dsp/x86/variance_impl_mmx.asm
+vpx_dsp/x86/variance_mmx.c
+vpx_dsp/x86/variance_sse2.c
+vpx_dsp/x86/vpx_asm_stubs.c
+vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/emms.asm
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_ports/x86.h
+vpx_ports/x86_abi_support.asm
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
new file mode 100644
index 0000000..fc714f4
--- /dev/null
+++ b/config/x86/vp8_rtcd.h
@@ -0,0 +1,294 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_ssse3
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_xmm
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
+
+void vp8_clear_system_state_c();
+void vpx_reset_mmx_state();
+#define vp8_clear_system_state vpx_reset_mmx_state
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+#define vp8_copy32xn vp8_copy32xn_sse3
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_sse2
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_mmx
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sadx4
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_ssse3
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sadx3
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_xmm
+
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
+
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
+
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_xmm
+
+void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+#define vp8_plane_add_noise vp8_plane_add_noise_wmt
+
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_ssse3
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_ssse3
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_ssse3
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_ssse3
+
+void vp8_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
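Usage note: in this x86 config every symbol above is pinned to a single implementation with #define, and setup_rtcd_internal() only reads and discards the SIMD flags, which is the shape rtcd.pl produces when runtime CPU detection is disabled for the target. For comparison, a hedged sketch of what the same template produces when runtime detection is enabled (the HAS_SSE2/HAS_SSSE3 flag names come from vpx_ports/x86.h; the function shown is just an example, not taken from this config):

    /* Illustrative only -- not part of this config's generated header. */
    RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch,
                                                int xofst, int yofst,
                                                unsigned char *dst, int dst_pitch);

    static void setup_rtcd_internal(void)
    {
        int flags = x86_simd_caps();   /* bitmask of HAS_MMX, HAS_SSE2, HAS_SSSE3, ... */

        /* Start from the portable C version, then upgrade to the best supported SIMD path. */
        vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
        if (flags & HAS_SSE2)  vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
        if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
    }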
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
new file mode 100644
index 0000000..dcaa6ea
--- /dev/null
+++ b/config/x86/vp9_rtcd.h
@@ -0,0 +1,147 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+unsigned int vp9_avg_4x4_sse2(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_sse2
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_sse2
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_sse2
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_sse2
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_sse2
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_sse2
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_sse2
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sadx3
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_mmx
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_sse2
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_sse2
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_sse2
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_sse2
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_sse2
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_sse2
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+int16_t vp9_satd_sse2(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_sse2
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_sse2
+
+void vp9_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
new file mode 100644
index 0000000..2b7f1cc
--- /dev/null
+++ b/config/x86/vpx_config.asm
@@ -0,0 +1,85 @@
+%define ARCH_ARM 0
+%define ARCH_MIPS 0
+%define ARCH_X86 1
+%define ARCH_X86_64 0
+%define HAVE_EDSP 0
+%define HAVE_MEDIA 0
+%define HAVE_NEON 0
+%define HAVE_NEON_ASM 0
+%define HAVE_MIPS32 0
+%define HAVE_DSPR2 0
+%define HAVE_MSA 0
+%define HAVE_MIPS64 0
+%define HAVE_MMX 1
+%define HAVE_SSE 1
+%define HAVE_SSE2 1
+%define HAVE_SSE3 1
+%define HAVE_SSSE3 1
+%define HAVE_SSE4_1 0
+%define HAVE_AVX 0
+%define HAVE_AVX2 0
+%define HAVE_VPX_PORTS 1
+%define HAVE_STDINT_H 1
+%define HAVE_PTHREAD_H 1
+%define HAVE_SYS_MMAN_H 1
+%define HAVE_UNISTD_H 1
+%define CONFIG_DEPENDENCY_TRACKING 1
+%define CONFIG_EXTERNAL_BUILD 1
+%define CONFIG_INSTALL_DOCS 1
+%define CONFIG_INSTALL_BINS 1
+%define CONFIG_INSTALL_LIBS 1
+%define CONFIG_INSTALL_SRCS 0
+%define CONFIG_USE_X86INC 1
+%define CONFIG_DEBUG 0
+%define CONFIG_GPROF 0
+%define CONFIG_GCOV 0
+%define CONFIG_RVCT 0
+%define CONFIG_GCC 1
+%define CONFIG_MSVS 0
+%define CONFIG_PIC 1
+%define CONFIG_BIG_ENDIAN 0
+%define CONFIG_CODEC_SRCS 0
+%define CONFIG_DEBUG_LIBS 0
+%define CONFIG_DEQUANT_TOKENS 0
+%define CONFIG_DC_RECON 0
+%define CONFIG_RUNTIME_CPU_DETECT 0
+%define CONFIG_POSTPROC 1
+%define CONFIG_VP9_POSTPROC 0
+%define CONFIG_MULTITHREAD 1
+%define CONFIG_INTERNAL_STATS 0
+%define CONFIG_VP8_ENCODER 1
+%define CONFIG_VP8_DECODER 1
+%define CONFIG_VP9_ENCODER 1
+%define CONFIG_VP9_DECODER 1
+%define CONFIG_VP10_ENCODER 0
+%define CONFIG_VP10_DECODER 0
+%define CONFIG_VP8 1
+%define CONFIG_VP9 1
+%define CONFIG_VP10 0
+%define CONFIG_ENCODERS 1
+%define CONFIG_DECODERS 1
+%define CONFIG_STATIC_MSVCRT 0
+%define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_REALTIME_ONLY 1
+%define CONFIG_ONTHEFLY_BITPACKING 0
+%define CONFIG_ERROR_CONCEALMENT 0
+%define CONFIG_SHARED 0
+%define CONFIG_STATIC 1
+%define CONFIG_SMALL 0
+%define CONFIG_POSTPROC_VISUALIZER 0
+%define CONFIG_OS_SUPPORT 1
+%define CONFIG_UNIT_TESTS 1
+%define CONFIG_WEBM_IO 1
+%define CONFIG_LIBYUV 1
+%define CONFIG_DECODE_PERF_TESTS 0
+%define CONFIG_ENCODE_PERF_TESTS 0
+%define CONFIG_MULTI_RES_ENCODING 0
+%define CONFIG_TEMPORAL_DENOISING 1
+%define CONFIG_VP9_TEMPORAL_DENOISING 0
+%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_EXPERIMENTAL 0
+%define CONFIG_SIZE_LIMIT 0
+%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_FP_MB_STATS 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/generic/vpx_config.c b/config/x86/vpx_config.c
similarity index 67%
copy from generic/vpx_config.c
copy to config/x86/vpx_config.c
index 61f4fd8..4b7c558 100644
--- a/generic/vpx_config.c
+++ b/config/x86/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/x86/vpx_config.h b/config/x86/vpx_config.h
similarity index 86%
rename from x86/vpx_config.h
rename to config/x86/vpx_config.h
index dd5114d..634c67b 100644
--- a/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -9,35 +9,34 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 0
 #define ARCH_X86 1
 #define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 1
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
-#define HAVE_SSE3 0
-#define HAVE_SSSE3 0
+#define HAVE_SSE3 1
+#define HAVE_SSSE3 1
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
@@ -53,10 +52,6 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..64ee53f
--- /dev/null
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -0,0 +1,894 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_ssse3
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_ssse3
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_ssse3
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_sse2
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_sse2
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_ssse3
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_ssse3
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_ssse3
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_ssse3
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_ssse3
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_ssse3
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_ssse3
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_ssse3
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_ssse3
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_ssse3
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_ssse3
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_ssse3
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_ssse3
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_ssse3
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_ssse3
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_ssse3
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_sse2
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_sse2
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_sse2
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_sse2
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_sse2
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_sse2
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_sse2
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_sse2
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_sse2
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_sse2
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+unsigned int vpx_get_mb_ss_mmx(const int16_t *);
+unsigned int vpx_get_mb_ss_sse2(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_sse2
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_ssse3
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_ssse3
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_ssse3
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_ssse3
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_sse2
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_sse2
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_sse2
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_mmx
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_sse2
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_sse2
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_sse2
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_sse2
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_sse2
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_mmx
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_sse2
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_sse2
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_sse2
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_sse2
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_sse2
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_sse2
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_sse2
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_sse2
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_ssse3
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_sse2
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_sse2
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_ssse3
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_sse2
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_sse2
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_sse2
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_sse2
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_sse2
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_sse
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_sse
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_sse3
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_sse
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_sse
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_sse
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_sse
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_sse2
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_sse2
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_sse2
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_sse2
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_sse3
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_sse2
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_sse2
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_sse3
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_ssse3
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_ssse3
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_ssse3
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_ssse3
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_ssse3
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_ssse3
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_ssse3
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_ssse3
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_ssse3
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_ssse3
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_ssse3
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_ssse3
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_ssse3
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_ssse3
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_ssse3
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_sse2
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_sse2
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_sse2
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_sse2
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_sse2
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_sse2
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_sse2
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_sse2
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_sse2
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_sse2
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_sse2
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_sse2
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_sse2
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_sse2
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_mmx
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_mmx
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_mmx
+
+void vpx_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/x86/vpx_scale_rtcd.h b/config/x86/vpx_scale_rtcd.h
similarity index 90%
rename from x86/vpx_scale_rtcd.h
rename to config/x86/vpx_scale_rtcd.h
index 7487e5f..ddf7d01 100644
--- a/x86/vpx_scale_rtcd.h
+++ b/config/x86/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/x86/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/x86/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/x86/libvpx_srcs.txt b/config/x86_64/libvpx_srcs.txt
similarity index 75%
copy from x86/libvpx_srcs.txt
copy to config/x86_64/libvpx_srcs.txt
index 91a8b84..ac3de52 100644
--- a/x86/libvpx_srcs.txt
+++ b/config/x86_64/libvpx_srcs.txt
@@ -1,6 +1,6 @@
+CHANGELOG
 build/make/rtcd.pl
 build/make/version.sh
-CHANGELOG
 libs.mk
 vp8/common/alloccommon.c
 vp8/common/alloccommon.h
@@ -8,6 +8,7 @@
 vp8/common/blockd.h
 vp8/common/coefupdateprobs.h
 vp8/common/common.h
+vp8/common/copy_c.c
 vp8/common/debugmodes.c
 vp8/common/default_coef_probs.h
 vp8/common/dequantize.c
@@ -28,17 +29,16 @@
 vp8/common/idct_blk.c
 vp8/common/idctllm.c
 vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
 vp8/common/loopfilter.h
+vp8/common/loopfilter_filters.c
 vp8/common/mbpitch.c
 vp8/common/mfqe.c
 vp8/common/modecont.c
 vp8/common/modecont.h
 vp8/common/mv.h
+vp8/common/onyx.h
 vp8/common/onyxc_int.h
 vp8/common/onyxd.h
-vp8/common/onyx.h
 vp8/common/postproc.c
 vp8/common/postproc.h
 vp8/common/ppflags.h
@@ -46,12 +46,11 @@
 vp8/common/quant_common.h
 vp8/common/reconinter.c
 vp8/common/reconinter.h
+vp8/common/reconintra.c
 vp8/common/reconintra4x4.c
 vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
 vp8/common/rtcd.c
 vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
 vp8/common/setupintrarecon.c
 vp8/common/setupintrarecon.h
 vp8/common/swapyv12buffer.c
@@ -60,9 +59,10 @@
 vp8/common/threading.h
 vp8/common/treecoder.c
 vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
 vp8/common/vp8_entropymodedata.h
+vp8/common/vp8_loopfilter.c
+vp8/common/x86/copy_sse2.asm
+vp8/common/x86/copy_sse3.asm
 vp8/common/x86/dequantize_mmx.asm
 vp8/common/x86/filter_x86.c
 vp8/common/x86/filter_x86.h
@@ -72,7 +72,7 @@
 vp8/common/x86/idctllm_sse2.asm
 vp8/common/x86/iwalsh_mmx.asm
 vp8/common/x86/iwalsh_sse2.asm
-vp8/common/x86/loopfilter_mmx.asm
+vp8/common/x86/loopfilter_block_sse2_x86_64.asm
 vp8/common/x86/loopfilter_sse2.asm
 vp8/common/x86/loopfilter_x86.c
 vp8/common/x86/mfqe_sse2.asm
@@ -81,15 +81,11 @@
 vp8/common/x86/recon_mmx.asm
 vp8/common/x86/recon_sse2.asm
 vp8/common/x86/recon_wrapper_sse2.c
-vp8/common/x86/sad_mmx.asm
-vp8/common/x86/sad_sse2.asm
 vp8/common/x86/subpixel_mmx.asm
 vp8/common/x86/subpixel_sse2.asm
-vp8/common/x86/variance_impl_mmx.asm
-vp8/common/x86/variance_impl_sse2.asm
-vp8/common/x86/variance_mmx.c
-vp8/common/x86/variance_sse2.c
+vp8/common/x86/subpixel_ssse3.asm
 vp8/common/x86/vp8_asm_stubs.c
+vp8/common/x86/vp8_loopfilter_mmx.asm
 vp8/decoder/dboolhuff.c
 vp8/decoder/dboolhuff.h
 vp8/decoder/decodeframe.c
@@ -134,7 +130,6 @@
 vp8/encoder/pickinter.c
 vp8/encoder/pickinter.h
 vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
 vp8/encoder/quantize.h
 vp8/encoder/ratectrl.c
 vp8/encoder/ratectrl.h
@@ -146,32 +141,29 @@
 vp8/encoder/tokenize.h
 vp8/encoder/treewriter.c
 vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
+vp8/encoder/vp8_quantize.c
 vp8/encoder/x86/dct_mmx.asm
 vp8/encoder/x86/dct_sse2.asm
 vp8/encoder/x86/denoising_sse2.c
 vp8/encoder/x86/encodeopt.asm
 vp8/encoder/x86/fwalsh_sse2.asm
 vp8/encoder/x86/quantize_mmx.asm
-vp8/encoder/x86/quantize_sse2.c
-vp8/encoder/x86/subtract_mmx.asm
-vp8/encoder/x86/subtract_sse2.asm
+vp8/encoder/x86/quantize_ssse3.c
 vp8/encoder/x86/vp8_enc_stubs_mmx.c
 vp8/encoder/x86/vp8_enc_stubs_sse2.c
+vp8/encoder/x86/vp8_quantize_sse2.c
 vp8/vp8_common.mk
 vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
 vp8/vp8_dx_iface.c
+vp8/vp8cx.mk
 vp8/vp8dx.mk
 vp9/common/vp9_alloccommon.c
 vp9/common/vp9_alloccommon.h
 vp9/common/vp9_blockd.c
 vp9/common/vp9_blockd.h
+vp9/common/vp9_common.h
 vp9/common/vp9_common_data.c
 vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
 vp9/common/vp9_debugmodes.c
 vp9/common/vp9_entropy.c
 vp9/common/vp9_entropy.h
@@ -187,7 +179,6 @@
 vp9/common/vp9_idct.c
 vp9/common/vp9_idct.h
 vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
 vp9/common/vp9_loopfilter.h
 vp9/common/vp9_mv.h
 vp9/common/vp9_mvref_common.c
@@ -196,8 +187,6 @@
 vp9/common/vp9_ppflags.h
 vp9/common/vp9_pred_common.c
 vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
 vp9/common/vp9_quant_common.c
 vp9/common/vp9_quant_common.h
 vp9/common/vp9_reconinter.c
@@ -212,21 +201,12 @@
 vp9/common/vp9_scan.h
 vp9/common/vp9_seg_common.c
 vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
 vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
+vp9/common/vp9_thread_common.c
+vp9/common/vp9_thread_common.h
 vp9/common/vp9_tile_common.c
 vp9/common/vp9_tile_common.h
-vp9/common/x86/vp9_asm_stubs.c
-vp9/common/x86/vp9_copy_sse2.asm
 vp9/common/x86/vp9_idct_intrin_sse2.c
-vp9/common/x86/vp9_idct_intrin_sse2.h
-vp9/common/x86/vp9_intrapred_sse2.asm
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c
-vp9/common/x86/vp9_loopfilter_mmx.asm
-vp9/common/x86/vp9_subpixel_8t_sse2.asm
-vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
 vp9/decoder/vp9_decodeframe.c
 vp9/decoder/vp9_decodeframe.h
 vp9/decoder/vp9_decodemv.c
@@ -239,16 +219,13 @@
 vp9/decoder/vp9_dsubexp.h
 vp9/decoder/vp9_dthread.c
 vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
 vp9/encoder/vp9_aq_complexity.c
 vp9/encoder/vp9_aq_complexity.h
 vp9/encoder/vp9_aq_cyclicrefresh.c
 vp9/encoder/vp9_aq_cyclicrefresh.h
 vp9/encoder/vp9_aq_variance.c
 vp9/encoder/vp9_aq_variance.h
+vp9/encoder/vp9_avg.c
 vp9/encoder/vp9_bitstream.c
 vp9/encoder/vp9_bitstream.h
 vp9/encoder/vp9_block.h
@@ -265,6 +242,8 @@
 vp9/encoder/vp9_encodemv.h
 vp9/encoder/vp9_encoder.c
 vp9/encoder/vp9_encoder.h
+vp9/encoder/vp9_ethread.c
+vp9/encoder/vp9_ethread.h
 vp9/encoder/vp9_extend.c
 vp9/encoder/vp9_extend.h
 vp9/encoder/vp9_firstpass.c
@@ -289,9 +268,10 @@
 vp9/encoder/vp9_rdopt.h
 vp9/encoder/vp9_resize.c
 vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
 vp9/encoder/vp9_segmentation.c
 vp9/encoder/vp9_segmentation.h
+vp9/encoder/vp9_skin_detection.c
+vp9/encoder/vp9_skin_detection.h
 vp9/encoder/vp9_speed_features.c
 vp9/encoder/vp9_speed_features.h
 vp9/encoder/vp9_subexp.c
@@ -304,68 +284,31 @@
 vp9/encoder/vp9_tokenize.h
 vp9/encoder/vp9_treewriter.c
 vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
-vp9/encoder/x86/vp9_dct32x32_sse2.c
+vp9/encoder/x86/vp9_avg_intrin_sse2.c
 vp9/encoder/x86/vp9_dct_mmx.asm
 vp9/encoder/x86/vp9_dct_sse2.c
+vp9/encoder/x86/vp9_dct_ssse3.c
+vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
 vp9/encoder/x86/vp9_error_sse2.asm
-vp9/encoder/x86/vp9_sad4d_sse2.asm
-vp9/encoder/x86/vp9_sad_mmx.asm
-vp9/encoder/x86/vp9_sad_sse2.asm
-vp9/encoder/x86/vp9_subpel_variance.asm
-vp9/encoder/x86/vp9_subtract_sse2.asm
+vp9/encoder/x86/vp9_quantize_sse2.c
+vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
 vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
-vp9/encoder/x86/vp9_variance_impl_mmx.asm
-vp9/encoder/x86/vp9_variance_impl_sse2.asm
-vp9/encoder/x86/vp9_variance_mmx.c
-vp9/encoder/x86/vp9_variance_sse2.c
 vp9/vp9_common.mk
 vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
 vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
 vp9/vp9_iface_common.h
-vpx_config.c
+vp9/vp9cx.mk
+vp9/vp9dx.mk
 vpx/internal/vpx_codec_internal.h
 vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/emms.asm
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_ports/x86_abi_support.asm
-vpx_ports/x86.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
 vpx/src/vpx_codec.c
 vpx/src/vpx_decoder.c
 vpx/src/vpx_encoder.c
 vpx/src/vpx_image.c
 vpx/src/vpx_psnr.c
+vpx/vp8.h
 vpx/vp8cx.h
 vpx/vp8dx.h
-vpx/vp8.h
 vpx/vpx_codec.h
 vpx/vpx_codec.mk
 vpx/vpx_decoder.h
@@ -373,3 +316,99 @@
 vpx/vpx_frame_buffer.h
 vpx/vpx_image.h
 vpx/vpx_integer.h
+vpx_config.c
+vpx_dsp/bitreader.c
+vpx_dsp/bitreader.h
+vpx_dsp/bitreader_buffer.c
+vpx_dsp/bitreader_buffer.h
+vpx_dsp/bitwriter.c
+vpx_dsp/bitwriter.h
+vpx_dsp/bitwriter_buffer.c
+vpx_dsp/bitwriter_buffer.h
+vpx_dsp/fwd_txfm.c
+vpx_dsp/fwd_txfm.h
+vpx_dsp/intrapred.c
+vpx_dsp/inv_txfm.c
+vpx_dsp/inv_txfm.h
+vpx_dsp/loopfilter.c
+vpx_dsp/prob.c
+vpx_dsp/prob.h
+vpx_dsp/quantize.c
+vpx_dsp/quantize.h
+vpx_dsp/sad.c
+vpx_dsp/subtract.c
+vpx_dsp/txfm_common.h
+vpx_dsp/variance.c
+vpx_dsp/variance.h
+vpx_dsp/vpx_convolve.c
+vpx_dsp/vpx_convolve.h
+vpx_dsp/vpx_dsp.mk
+vpx_dsp/vpx_dsp_common.h
+vpx_dsp/vpx_dsp_rtcd.c
+vpx_dsp/vpx_dsp_rtcd_defs.pl
+vpx_dsp/vpx_filter.h
+vpx_dsp/x86/convolve.h
+vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+vpx_dsp/x86/fwd_txfm_impl_sse2.h
+vpx_dsp/x86/fwd_txfm_sse2.c
+vpx_dsp/x86/fwd_txfm_sse2.h
+vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+vpx_dsp/x86/intrapred_sse2.asm
+vpx_dsp/x86/intrapred_ssse3.asm
+vpx_dsp/x86/inv_txfm_sse2.c
+vpx_dsp/x86/inv_txfm_sse2.h
+vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+vpx_dsp/x86/inv_wht_sse2.asm
+vpx_dsp/x86/loopfilter_mmx.asm
+vpx_dsp/x86/loopfilter_sse2.c
+vpx_dsp/x86/quantize_sse2.c
+vpx_dsp/x86/quantize_ssse3_x86_64.asm
+vpx_dsp/x86/sad4d_sse2.asm
+vpx_dsp/x86/sad_mmx.asm
+vpx_dsp/x86/sad_sse2.asm
+vpx_dsp/x86/sad_sse3.asm
+vpx_dsp/x86/sad_ssse3.asm
+vpx_dsp/x86/ssim_opt_x86_64.asm
+vpx_dsp/x86/subpel_variance_sse2.asm
+vpx_dsp/x86/subtract_sse2.asm
+vpx_dsp/x86/txfm_common_sse2.h
+vpx_dsp/x86/variance_impl_mmx.asm
+vpx_dsp/x86/variance_mmx.c
+vpx_dsp/x86/variance_sse2.c
+vpx_dsp/x86/vpx_asm_stubs.c
+vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+vpx_mem/include/vpx_mem_intrnl.h
+vpx_mem/vpx_mem.c
+vpx_mem/vpx_mem.h
+vpx_mem/vpx_mem.mk
+vpx_ports/bitops.h
+vpx_ports/emmintrin_compat.h
+vpx_ports/emms.asm
+vpx_ports/mem.h
+vpx_ports/mem_ops.h
+vpx_ports/mem_ops_aligned.h
+vpx_ports/msvc.h
+vpx_ports/system_state.h
+vpx_ports/vpx_once.h
+vpx_ports/vpx_ports.mk
+vpx_ports/vpx_timer.h
+vpx_ports/x86.h
+vpx_ports/x86_abi_support.asm
+vpx_scale/generic/gen_scalers.c
+vpx_scale/generic/vpx_scale.c
+vpx_scale/generic/yv12config.c
+vpx_scale/generic/yv12extend.c
+vpx_scale/vpx_scale.h
+vpx_scale/vpx_scale.mk
+vpx_scale/vpx_scale_rtcd.c
+vpx_scale/vpx_scale_rtcd.pl
+vpx_scale/yv12config.h
+vpx_util/endian_inl.h
+vpx_util/vpx_thread.c
+vpx_util/vpx_thread.h
+vpx_util/vpx_util.mk
diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h
new file mode 100644
index 0000000..fc714f4
--- /dev/null
+++ b/config/x86_64/vp8_rtcd.h
@@ -0,0 +1,294 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_ssse3
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_xmm
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_ssse3
+
+void vp8_clear_system_state_c();
+void vpx_reset_mmx_state();
+#define vp8_clear_system_state vpx_reset_mmx_state
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+#define vp8_copy32xn vp8_copy32xn_sse3
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_sse2
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_mmx
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sadx4
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_ssse3
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sadx3
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_xmm
+
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
+
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
+
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_xmm
+
+void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
+#define vp8_plane_add_noise vp8_plane_add_noise_wmt
+
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_ssse3
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_ssse3
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_ssse3
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_ssse3
+
+void vp8_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h
new file mode 100644
index 0000000..67cfa5e
--- /dev/null
+++ b/config/x86_64/vp9_rtcd.h
@@ -0,0 +1,150 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_avg_4x4_c(const uint8_t *, int p);
+unsigned int vp9_avg_4x4_sse2(const uint8_t *, int p);
+#define vp9_avg_4x4 vp9_avg_4x4_sse2
+
+unsigned int vp9_avg_8x8_c(const uint8_t *, int p);
+unsigned int vp9_avg_8x8_sse2(const uint8_t *, int p);
+#define vp9_avg_8x8 vp9_avg_8x8_sse2
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_sse2
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_sse2
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_sse2
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_sse2
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_sse2
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sadx3
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_mmx
+
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_16x16 vp9_hadamard_16x16_sse2
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff);
+void vp9_hadamard_8x8_ssse3(int16_t const *src_diff, int src_stride, int16_t *coeff);
+#define vp9_hadamard_8x8 vp9_hadamard_8x8_ssse3
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width);
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width);
+#define vp9_int_pro_col vp9_int_pro_col_sse2
+
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height);
+#define vp9_int_pro_row vp9_int_pro_row_sse2
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vp9_minmax_8x8 vp9_minmax_8x8_sse2
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_ssse3
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3
+
+int16_t vp9_satd_c(const int16_t *coeff, int length);
+int16_t vp9_satd_sse2(const int16_t *coeff, int length);
+#define vp9_satd vp9_satd_sse2
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl);
+#define vp9_vector_var vp9_vector_var_sse2
+
+void vp9_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm
new file mode 100644
index 0000000..6f0800b
--- /dev/null
+++ b/config/x86_64/vpx_config.asm
@@ -0,0 +1,85 @@
+%define ARCH_ARM 0
+%define ARCH_MIPS 0
+%define ARCH_X86 0
+%define ARCH_X86_64 1
+%define HAVE_EDSP 0
+%define HAVE_MEDIA 0
+%define HAVE_NEON 0
+%define HAVE_NEON_ASM 0
+%define HAVE_MIPS32 0
+%define HAVE_DSPR2 0
+%define HAVE_MSA 0
+%define HAVE_MIPS64 0
+%define HAVE_MMX 1
+%define HAVE_SSE 1
+%define HAVE_SSE2 1
+%define HAVE_SSE3 1
+%define HAVE_SSSE3 1
+%define HAVE_SSE4_1 0
+%define HAVE_AVX 0
+%define HAVE_AVX2 0
+%define HAVE_VPX_PORTS 1
+%define HAVE_STDINT_H 1
+%define HAVE_PTHREAD_H 1
+%define HAVE_SYS_MMAN_H 1
+%define HAVE_UNISTD_H 1
+%define CONFIG_DEPENDENCY_TRACKING 1
+%define CONFIG_EXTERNAL_BUILD 1
+%define CONFIG_INSTALL_DOCS 1
+%define CONFIG_INSTALL_BINS 1
+%define CONFIG_INSTALL_LIBS 1
+%define CONFIG_INSTALL_SRCS 0
+%define CONFIG_USE_X86INC 1
+%define CONFIG_DEBUG 0
+%define CONFIG_GPROF 0
+%define CONFIG_GCOV 0
+%define CONFIG_RVCT 0
+%define CONFIG_GCC 1
+%define CONFIG_MSVS 0
+%define CONFIG_PIC 1
+%define CONFIG_BIG_ENDIAN 0
+%define CONFIG_CODEC_SRCS 0
+%define CONFIG_DEBUG_LIBS 0
+%define CONFIG_DEQUANT_TOKENS 0
+%define CONFIG_DC_RECON 0
+%define CONFIG_RUNTIME_CPU_DETECT 0
+%define CONFIG_POSTPROC 1
+%define CONFIG_VP9_POSTPROC 0
+%define CONFIG_MULTITHREAD 1
+%define CONFIG_INTERNAL_STATS 0
+%define CONFIG_VP8_ENCODER 1
+%define CONFIG_VP8_DECODER 1
+%define CONFIG_VP9_ENCODER 1
+%define CONFIG_VP9_DECODER 1
+%define CONFIG_VP10_ENCODER 0
+%define CONFIG_VP10_DECODER 0
+%define CONFIG_VP8 1
+%define CONFIG_VP9 1
+%define CONFIG_VP10 0
+%define CONFIG_ENCODERS 1
+%define CONFIG_DECODERS 1
+%define CONFIG_STATIC_MSVCRT 0
+%define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_REALTIME_ONLY 1
+%define CONFIG_ONTHEFLY_BITPACKING 0
+%define CONFIG_ERROR_CONCEALMENT 0
+%define CONFIG_SHARED 0
+%define CONFIG_STATIC 1
+%define CONFIG_SMALL 0
+%define CONFIG_POSTPROC_VISUALIZER 0
+%define CONFIG_OS_SUPPORT 1
+%define CONFIG_UNIT_TESTS 1
+%define CONFIG_WEBM_IO 1
+%define CONFIG_LIBYUV 1
+%define CONFIG_DECODE_PERF_TESTS 0
+%define CONFIG_ENCODE_PERF_TESTS 0
+%define CONFIG_MULTI_RES_ENCODING 0
+%define CONFIG_TEMPORAL_DENOISING 1
+%define CONFIG_VP9_TEMPORAL_DENOISING 0
+%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_EXPERIMENTAL 0
+%define CONFIG_SIZE_LIMIT 0
+%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_FP_MB_STATS 0
+%define CONFIG_EMULATE_HARDWARE 0
diff --git a/generic/vpx_config.c b/config/x86_64/vpx_config.c
similarity index 67%
copy from generic/vpx_config.c
copy to config/x86_64/vpx_config.c
index 61f4fd8..2561717 100644
--- a/generic/vpx_config.c
+++ b/config/x86_64/vpx_config.c
@@ -5,5 +5,6 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=generic-gnu --disable-examples --disable-docs --enable-realtime-only";
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/x86/vpx_config.h b/config/x86_64/vpx_config.h
similarity index 85%
copy from x86/vpx_config.h
copy to config/x86_64/vpx_config.h
index dd5114d..8796347 100644
--- a/x86/vpx_config.h
+++ b/config/x86_64/vpx_config.h
@@ -9,35 +9,34 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __inline__ __attribute__((always_inline))
+#define INLINE      inline
 #define ARCH_ARM 0
 #define ARCH_MIPS 0
-#define ARCH_X86 1
-#define ARCH_X86_64 0
-#define ARCH_PPC32 0
-#define ARCH_PPC64 0
+#define ARCH_X86 0
+#define ARCH_X86_64 1
 #define HAVE_EDSP 0
 #define HAVE_MEDIA 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
 #define HAVE_MMX 1
 #define HAVE_SSE 1
 #define HAVE_SSE2 1
-#define HAVE_SSE3 0
-#define HAVE_SSSE3 0
+#define HAVE_SSE3 1
+#define HAVE_SSSE3 1
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
-#define HAVE_ALTIVEC 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_STDINT_H 1
-#define HAVE_ALT_TREE_LAYOUT 0
 #define HAVE_PTHREAD_H 1
 #define HAVE_SYS_MMAN_H 1
 #define HAVE_UNISTD_H 1
-#define CONFIG_EXTERNAL_BUILD 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
 #define CONFIG_INSTALL_DOCS 1
 #define CONFIG_INSTALL_BINS 1
 #define CONFIG_INSTALL_LIBS 1
@@ -53,10 +52,6 @@
 #define CONFIG_BIG_ENDIAN 0
 #define CONFIG_CODEC_SRCS 0
 #define CONFIG_DEBUG_LIBS 0
-#define CONFIG_FAST_UNALIGNED 1
-#define CONFIG_MEM_MANAGER 0
-#define CONFIG_MEM_TRACKER 0
-#define CONFIG_MEM_CHECKS 0
 #define CONFIG_DEQUANT_TOKENS 0
 #define CONFIG_DC_RECON 0
 #define CONFIG_RUNTIME_CPU_DETECT 0
@@ -68,8 +63,11 @@
 #define CONFIG_VP8_DECODER 1
 #define CONFIG_VP9_ENCODER 1
 #define CONFIG_VP9_DECODER 1
+#define CONFIG_VP10_ENCODER 0
+#define CONFIG_VP10_DECODER 0
 #define CONFIG_VP8 1
 #define CONFIG_VP9 1
+#define CONFIG_VP10 0
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_STATIC_MSVCRT 0
@@ -82,17 +80,19 @@
 #define CONFIG_SMALL 0
 #define CONFIG_POSTPROC_VISUALIZER 0
 #define CONFIG_OS_SUPPORT 1
-#define CONFIG_UNIT_TESTS 0
+#define CONFIG_UNIT_TESTS 1
 #define CONFIG_WEBM_IO 1
 #define CONFIG_LIBYUV 1
 #define CONFIG_DECODE_PERF_TESTS 0
 #define CONFIG_ENCODE_PERF_TESTS 0
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_SPATIAL_SVC 0
-#define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
 #endif /* VPX_CONFIG_H */
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000..e78d8ef
--- /dev/null
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -0,0 +1,900 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_ssse3
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_ssse3
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_ssse3
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_sse2
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_sse2
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_ssse3
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_ssse3
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_ssse3
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_ssse3
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_ssse3
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_ssse3
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_ssse3
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_ssse3
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_ssse3
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_ssse3
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_ssse3
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_ssse3
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_ssse3
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_ssse3
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_ssse3
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_ssse3
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_sse2
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_sse2
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_sse2
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_sse2
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_sse2
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_sse2
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_sse2
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_ssse3
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_sse2
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_sse2
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+unsigned int vpx_get_mb_ss_mmx(const int16_t *);
+unsigned int vpx_get_mb_ss_sse2(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_sse2
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_ssse3
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_ssse3
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_ssse3
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_ssse3
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_sse2
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_ssse3
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_sse2
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_mmx
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_sse2
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_sse2
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_sse2
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_sse2
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_sse2
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_mmx
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_sse2
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_sse2
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_mmx(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_sse2
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_sse2
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_sse2
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_sse2
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_ssse3
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_sse2
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
+
+void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x3 vpx_sad16x16x3_ssse3
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
+
+void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x16x8 vpx_sad16x16x8_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_sse2
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_sse2
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
+
+void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x3 vpx_sad16x8x3_ssse3
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
+
+void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x8 vpx_sad16x8x8_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_sse2
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_sse2
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_sse2
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
+
+void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x3 vpx_sad32x32x3_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
+
+void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x32x8 vpx_sad32x32x8_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_sse2
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_sse2
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_sse
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_sse
+
+void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x3 vpx_sad4x4x3_sse3
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_sse
+
+void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x8 vpx_sad4x4x8_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_sse
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_sse(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_sse
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_sse
+
+void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x8 vpx_sad4x8x8_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_sse2
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_sse2
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_sse2
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
+
+void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x3 vpx_sad64x64x3_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
+
+void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x64x8 vpx_sad64x64x8_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_sse2
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
+
+void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x3 vpx_sad8x16x3_sse3
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
+
+void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x8 vpx_sad8x16x8_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_sse2
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
+
+void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x8 vpx_sad8x4x8_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_mmx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_sse2
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
+
+void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x3 vpx_sad8x8x3_sse3
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
+
+void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x8 vpx_sad8x8x8_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_ssse3
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_ssse3
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_ssse3
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_ssse3
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_ssse3
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_ssse3
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_ssse3
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_ssse3
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_ssse3
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_ssse3
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_ssse3
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_ssse3
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_ssse3
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_ssse3
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_ssse3
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_ssse3
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_sse2
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_sse2
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_sse2
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_sse2
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_sse2
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_sse2
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_sse2
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_sse2
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_sse2
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_sse2
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_sse2
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_sse2
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_sse2
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_sse2
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_h vpx_variance_halfpixvar16x16_h_mmx
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_hv vpx_variance_halfpixvar16x16_hv_mmx
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse);
+#define vpx_variance_halfpixvar16x16_v vpx_variance_halfpixvar16x16_v_mmx
+
+void vpx_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/x86/vpx_scale_rtcd.h b/config/x86_64/vpx_scale_rtcd.h
similarity index 90%
copy from x86/vpx_scale_rtcd.h
copy to config/x86_64/vpx_scale_rtcd.h
index 7487e5f..ddf7d01 100644
--- a/x86/vpx_scale_rtcd.h
+++ b/config/x86_64/vpx_scale_rtcd.h
@@ -7,12 +7,12 @@
 #define RTCD_EXTERN extern
 #endif
 
+struct yv12_buffer_config;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct yv12_buffer_config;
-
 void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
@@ -40,11 +40,11 @@
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
-#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/x86/vpx_version.h b/config/x86_64/vpx_version.h
similarity index 60%
copy from x86/vpx_version.h
copy to config/x86_64/vpx_version.h
index f60dcc7..bce0381 100644
--- a/x86/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  3
+#define VERSION_MINOR  4
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING      " v1.3.0"
+#define VERSION_STRING_NOSP "v1.4.0"
+#define VERSION_STRING      " v1.4.0"
diff --git a/generate_config.sh b/generate_config.sh
new file mode 100755
index 0000000..6c6adae
--- /dev/null
+++ b/generate_config.sh
@@ -0,0 +1,254 @@
+#!/bin/bash -e
+#
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script is used to generate the files in the <platform> directories needed
+# to build libvpx. Run this script every time the libvpx source code is updated.
+#
+# For example:
+# $ ./generate_config.sh
+#
+# This will update all the config files needed.
+
+export LC_ALL=C
+BASE_DIR=$(pwd)
+LIBVPX_SRC_DIR="libvpx"
+LIBVPX_CONFIG_DIR="config"
+
+# Clean files from previous make.
+function make_clean {
+  make clean > /dev/null
+  rm -f libvpx_srcs.txt
+}
+
+# Lint a pair of vpx_config.h and vpx_config.asm to make sure they match.
+# $1 - Header file directory.
+function lint_config {
+  # mips does not contain any assembly, so the header does not need to be
+  # compared to the asm.
+  if [[ "$1" != *mips* ]]; then
+    $BASE_DIR/lint_config.sh \
+      -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \
+      -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm
+  fi
+}
+
+# Print the configuration.
+# $1 - Header file directory.
+function print_config {
+  $BASE_DIR/lint_config.sh -p \
+    -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \
+    -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm
+}
+
+# Print the configuration from the header file.
+# This function is an abridged version of print_config; it does not use
+# lint_config and does not require vpx_config.asm to exist.
+# $1 - Header file directory.
+function print_config_basic {
+  combined_config="$(cat $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \
+                   | grep -E ' +[01] *$')"
+  combined_config="$(echo "$combined_config" | grep -v DO1STROUNDING)"
+  combined_config="$(echo "$combined_config" | sed 's/[ \t]//g')"
+  combined_config="$(echo "$combined_config" | sed 's/.*define//')"
+  combined_config="$(echo "$combined_config" | sed 's/0$/=no/')"
+  combined_config="$(echo "$combined_config" | sed 's/1$/=yes/')"
+  echo "$combined_config" | sort | uniq
+}
+
+# Generate *_rtcd.h files.
+# $1 - Header file directory.
+# $2 - Architecture.
+function gen_rtcd_header {
+  echo "Generate $LIBVPX_CONFIG_DIR/$1/*_rtcd.h files."
+
+  # We don't properly persist the config options specified on the configure
+  # line. Until that is fixed, force them here.
+  DISABLE_CONFIG="--disable-sse4_1 --disable-avx --disable-avx2"
+
+  rm -rf $BASE_DIR/$TEMP_DIR/libvpx.config
+  if [[ "$2" == *mips* ]]; then
+    print_config_basic $1 > $BASE_DIR/$TEMP_DIR/libvpx.config
+  else
+    $BASE_DIR/lint_config.sh -p \
+      -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \
+      -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm \
+      -o $BASE_DIR/$TEMP_DIR/libvpx.config
+  fi
+
+  $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \
+    --arch=$2 \
+    --sym=vp8_rtcd \
+    $DISABLE_CONFIG \
+    --config=$BASE_DIR/$TEMP_DIR/libvpx.config \
+    $BASE_DIR/$LIBVPX_SRC_DIR/vp8/common/rtcd_defs.pl \
+    > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vp8_rtcd.h
+
+  $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \
+    --arch=$2 \
+    --sym=vp9_rtcd \
+    --config=$BASE_DIR/$TEMP_DIR/libvpx.config \
+    $DISABLE_CONFIG \
+    $BASE_DIR/$LIBVPX_SRC_DIR/vp9/common/vp9_rtcd_defs.pl \
+    > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vp9_rtcd.h
+
+  $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \
+    --arch=$2 \
+    --sym=vpx_scale_rtcd \
+    --config=$BASE_DIR/$TEMP_DIR/libvpx.config \
+    $DISABLE_CONFIG \
+    $BASE_DIR/$LIBVPX_SRC_DIR/vpx_scale/vpx_scale_rtcd.pl \
+    > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_scale_rtcd.h
+
+  $BASE_DIR/$LIBVPX_SRC_DIR/build/make/rtcd.pl \
+    --arch=$2 \
+    --sym=vpx_dsp_rtcd \
+    --config=$BASE_DIR/$TEMP_DIR/libvpx.config \
+    $DISABLE_CONFIG \
+    $BASE_DIR/$LIBVPX_SRC_DIR/vpx_dsp/vpx_dsp_rtcd_defs.pl \
+    > $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_dsp_rtcd.h
+
+  rm -rf $BASE_DIR/$TEMP_DIR/libvpx.config
+}
+
+# Generate config files. "--enable-external-build" must be set to skip
+# detection of capabilities on specific targets.
+# $1 - Header file directory.
+# $2 - Config command line.
+function gen_config_files {
+  ./configure $2 > /dev/null
+
+  # Generate vpx_config.asm. Do not create one for mips.
+  if [[ "$1" != *mips* ]]; then
+    if [[ "$1" == *x86* ]]; then
+      egrep "#define [A-Z0-9_]+ [01]" vpx_config.h \
+        | awk '{print "%define " $2 " " $3}' > vpx_config.asm
+    else
+      egrep "#define [A-Z0-9_]+ [01]" vpx_config.h \
+        | awk '{print $2 " EQU " $3}' \
+        | perl $BASE_DIR/$LIBVPX_SRC_DIR/build/make/ads2gas.pl > vpx_config.asm
+    fi
+  fi
+
+  # Generate vpx_version.h
+  $BASE_DIR/$LIBVPX_SRC_DIR/build/make/version.sh "$BASE_DIR/$LIBVPX_SRC_DIR" vpx_version.h
+
+  cp vpx_config.* vpx_version.h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1
+  make_clean
+  rm -rf vpx_config.* vpx_version.h
+}
+
+echo "Create temporary directory."
+TEMP_DIR="$LIBVPX_SRC_DIR.temp"
+rm -rf $TEMP_DIR
+cp -R $LIBVPX_SRC_DIR $TEMP_DIR
+cd $TEMP_DIR
+
+echo "Generate config files."
+all_platforms="--enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect"
+intel="--disable-sse4_1 --disable-avx --disable-avx2 --as=yasm"
+gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}"
+gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}"
+gen_config_files arm "--target=armv6-linux-gcc  ${all_platforms}"
+gen_config_files arm-neon "--target=armv7-linux-gcc ${all_platforms}"
+gen_config_files arm64 "--force-target=armv8-linux-gcc ${all_platforms}"
+gen_config_files mips32 "--target=mips32-linux-gcc --disable-dspr2 ${all_platforms}"
+gen_config_files mips32-dspr2 "--target=mips32-linux-gcc --enable-dspr2 ${all_platforms}"
+gen_config_files mips64 "--target=mips64-linux-gcc ${all_platforms}"
+gen_config_files generic "--target=generic-gnu ${all_platforms}"
+
+echo "Remove temporary directory."
+cd $BASE_DIR
+rm -rf $TEMP_DIR
+
+echo "Lint libvpx configuration."
+lint_config x86
+lint_config x86_64
+lint_config arm
+lint_config arm-neon
+lint_config arm64
+lint_config mips32
+lint_config mips32-dspr2
+lint_config mips64
+lint_config generic
+
+echo "Create temporary directory."
+TEMP_DIR="$LIBVPX_SRC_DIR.temp"
+rm -rf $TEMP_DIR
+cp -R $LIBVPX_SRC_DIR $TEMP_DIR
+cd $TEMP_DIR
+
+gen_rtcd_header x86 x86
+gen_rtcd_header x86_64 x86_64
+gen_rtcd_header arm armv6
+gen_rtcd_header arm-neon armv7
+gen_rtcd_header arm64 armv8
+gen_rtcd_header mips32 mips32
+gen_rtcd_header mips32-dspr2 mips32
+gen_rtcd_header mips64 mips64
+gen_rtcd_header generic generic
+
+echo "Prepare Makefile."
+./configure --target=generic-gnu > /dev/null
+make_clean
+
+echo "Generate X86 source list."
+config=$(print_config x86)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/x86/
+
+echo "Generate X86_64 source list."
+config=$(print_config x86_64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/x86_64/
+
+echo "Generate ARM source list."
+config=$(print_config arm)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/arm/
+
+echo "Generate ARM NEON source list."
+config=$(print_config arm-neon)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/arm-neon/
+
+echo "Generate ARM64 source list."
+config=$(print_config arm64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/arm64/
+
+echo "Generate MIPS source list."
+config=$(print_config_basic mips32)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/mips32/
+
+echo "Generate MIPS DSPR2 source list."
+config=$(print_config_basic mips32-dspr2)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/mips32-dspr2/
+
+echo "Generate MIPS64 source list."
+config=$(print_config_basic mips64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/mips64/
+
+echo "Generate GENERIC source list."
+config=$(print_config_basic generic)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+cp libvpx_srcs.txt $BASE_DIR/$LIBVPX_CONFIG_DIR/generic/
+
+
+echo "Remove temporary directory."
+cd $BASE_DIR
+rm -rf $TEMP_DIR
diff --git a/generic/vp8_rtcd.h b/generic/vp8_rtcd.h
deleted file mode 100644
index 45ba8fa..0000000
--- a/generic/vp8_rtcd.h
+++ /dev/null
@@ -1,313 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_clear_system_state_c();
-#define vp8_clear_system_state vp8_clear_system_state_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_c
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_c
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_c
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_c
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_c
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_c
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
-
-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_c
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_c
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_c
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_c
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_c
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_c
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_c
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_c
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_c
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_c
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_c
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
-void vp8_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h
deleted file mode 100644
index c2df3fb..0000000
--- a/generic/vp9_rtcd.h
+++ /dev/null
@@ -1,690 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_c
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_c
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_c
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_c
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_c
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_c
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_c
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_c
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_c
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_c
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_c
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_c
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_c
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_c
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_c
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_c
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_c
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_c
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_c
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_c
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_c
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_c
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_c
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_c
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_c
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_c
-
-void vp9_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/generic/vpx_version.h b/generic/vpx_version.h
deleted file mode 100644
index 59adf99..0000000
--- a/generic/vpx_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define VERSION_MAJOR  1
-#define VERSION_MINOR  3
-#define VERSION_PATCH  0
-#define VERSION_EXTRA  "3825-gd4a47a6"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0-3825-gd4a47a6"
-#define VERSION_STRING      " v1.3.0-3825-gd4a47a6"
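
For reference, VERSION_PACKED in the header removed above folds the three version
components into a single integer, one byte per component (major << 16, minor << 8,
patch). A quick shell check of the packed value for these v1.3.0 headers,
illustrative only and not part of any build step:

  $ printf '0x%06x\n' $(( (1 << 16) | (3 << 8) | 0 ))
  0x010300
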
diff --git a/libvpx-asm-translation.mk b/libvpx-asm-translation.mk
new file mode 100644
index 0000000..4f73165
--- /dev/null
+++ b/libvpx-asm-translation.mk
@@ -0,0 +1,24 @@
+# Rules to generate GAS compatible assembly from RVCT syntax files.
+# Input variables:
+#   libvpx_2nd_arch
+#   libvpx_source_dir
+#   libvpx_codec_srcs_asm_<arch>
+#
+# Output variables:
+#   LOCAL_GENERATED_SOURCES_<arch>
+
+ifneq ($(strip $(libvpx_codec_srcs_asm_$(TARGET_$(libvpx_2nd_arch)ARCH))),)
+libvpx_intermediates := $(call local-intermediates-dir,,$(libvpx_2nd_arch))
+# This step is only required for ARM. MIPS uses intrinsics exclusively and x86
+# requires 'yasm' to pre-process its assembly files.
+# The ARM assembly sources must be converted from ADS to GAS compatible format.
+VPX_ASM := $(addprefix $(libvpx_intermediates)/, $(libvpx_codec_srcs_asm_$(TARGET_$(libvpx_2nd_arch)ARCH)))
+# The build system will only accept ARM assembly that ends in '.s'.
+VPX_GEN := $(addsuffix .s, $(VPX_ASM))
+$(VPX_GEN) : PRIVATE_SOURCE_DIR := $(libvpx_source_dir)
+$(VPX_GEN) : PRIVATE_CUSTOM_TOOL = cat $< | perl $(PRIVATE_SOURCE_DIR)/build/make/ads2gas.pl > $@
+$(VPX_GEN) : $(libvpx_intermediates)/%.s : $(libvpx_source_dir)/%
+	$(transform-generated-source)
+
+LOCAL_GENERATED_SOURCES_$(TARGET_$(libvpx_2nd_arch)ARCH) += $(VPX_GEN)
+endif
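
The PRIVATE_CUSTOM_TOOL rule above boils down to piping each listed .asm source
through ads2gas.pl and writing the result into the intermediates tree with an
extra '.s' suffix so the build system picks it up. Roughly, per file (the input
path here is only an example):

  $ cat libvpx/vp8/common/arm/armv6/idct_v6.asm \
      | perl libvpx/build/make/ads2gas.pl \
      > <intermediates>/vp8/common/arm/armv6/idct_v6.asm.s
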
diff --git a/libvpx-offsets.mk b/libvpx-offsets.mk
deleted file mode 100644
index 826520c..0000000
--- a/libvpx-offsets.mk
+++ /dev/null
@@ -1,69 +0,0 @@
-# Rules to generate assembly.
-# Input variables:
-#   libvpx_2nd_arch
-#   libvpx_source_dir
-#   libvpx_config_dir_<arch>
-#   libvpx_codec_srcs_asm_<arch>
-#
-# Output variables:
-#   LOCAL_GENERATED_SOURCES_<arch>
-#   LOCAL_C_INCLUDES_<arch>
-#
-
-# ARM and x86 use an 'offsets' file in the assembly. It is generated by
-# tricking the compiler and generating non-functional output which is then
-# processed with grep. For ARM, this must be additionally converted from
-# RVCT (ARM's in-house compiler) format to GNU Assembler Format for gcc.
-
-# Offset files are currently used in vpx_scale for NEON and some encoder
-# functions used in both ARM and x86. These files can not be compiled and need
-# to be named accordingly to avoid auto-build rules. The encoder files are not
-# used yet but are included in the comments for future reference.
-
-libvpx_asm_offsets_intermediates := \
-    vp8/encoder/vp8_asm_enc_offsets.intermediate \
-    vpx_scale/vpx_scale_asm_offsets.intermediate \
-
-libvpx_asm_offsets_files := \
-    vp8/encoder/vp8_asm_enc_offsets.asm \
-    vpx_scale/vpx_scale_asm_offsets.asm \
-
-libvpx_intermediates := $(call local-intermediates-dir,,$(libvpx_2nd_arch))
-# Build the S files with inline assembly.
-COMPILE_TO_S := $(addprefix $(libvpx_intermediates)/, $(libvpx_asm_offsets_intermediates))
-$(COMPILE_TO_S) : PRIVATE_2ND_ARCH := $(libvpx_2nd_arch)
-$(COMPILE_TO_S) : PRIVATE_INTERMEDIATES := $(libvpx_intermediates)
-$(COMPILE_TO_S) : PRIVATE_SOURCE_DIR := $(libvpx_source_dir)
-$(COMPILE_TO_S) : PRIVATE_CONFIG_DIR := $(libvpx_config_dir_$(TARGET_$(libvpx_2nd_arch)ARCH))
-$(COMPILE_TO_S) : PRIVATE_CUSTOM_TOOL = $($(PRIVATE_2ND_ARCH)TARGET_CC) -S $(addprefix -I, $($(PRIVATE_2ND_ARCH)TARGET_C_INCLUDES)) -I $(PRIVATE_INTERMEDIATES) -I $(PRIVATE_SOURCE_DIR) -I $(PRIVATE_CONFIG_DIR) -DINLINE_ASM -o $@ $<
-$(COMPILE_TO_S) : $(libvpx_intermediates)/%.intermediate : $(libvpx_source_dir)/%.c
-	$(transform-generated-source)
-
-# Extract the offsets from the inline assembly.
-OFFSETS_GEN := $(addprefix $(libvpx_intermediates)/, $(libvpx_asm_offsets_files))
-$(OFFSETS_GEN) : PRIVATE_OFFSET_PATTERN := '^[a-zA-Z0-9_]* EQU'
-$(OFFSETS_GEN) : PRIVATE_SOURCE_DIR := $(libvpx_source_dir)
-$(OFFSETS_GEN) : PRIVATE_CUSTOM_TOOL = grep $(PRIVATE_OFFSET_PATTERN) $< | tr -d '$$\#' | perl $(PRIVATE_SOURCE_DIR)/build/make/ads2gas.pl > $@
-$(OFFSETS_GEN) : %.asm : %.intermediate
-	$(transform-generated-source)
-
-LOCAL_GENERATED_SOURCES_$(TARGET_$(libvpx_2nd_arch)ARCH) += $(OFFSETS_GEN)
-
-ifneq ($(strip $(libvpx_codec_srcs_asm_$(TARGET_$(libvpx_2nd_arch)ARCH))),)
-# This step is only required for ARM. MIPS uses intrinsics and x86 requires an
-# assembler to pre-process its assembly files.
-# The ARM assembly sources must be converted from ADS to GAS compatible format.
-VPX_GEN := $(addprefix $(libvpx_intermediates)/, $(libvpx_codec_srcs_asm_$(TARGET_$(libvpx_2nd_arch)ARCH)))
-$(VPX_GEN) : PRIVATE_SOURCE_DIR := $(libvpx_source_dir)
-$(VPX_GEN) : PRIVATE_CUSTOM_TOOL = cat $< | perl $(PRIVATE_SOURCE_DIR)/build/make/ads2gas.pl > $@
-$(VPX_GEN) : $(libvpx_intermediates)/%.s : $(libvpx_source_dir)/%
-	$(transform-generated-source)
-
-LOCAL_GENERATED_SOURCES_$(TARGET_$(libvpx_2nd_arch)ARCH) += $(VPX_GEN)
-endif
-
-LOCAL_C_INCLUDES_$(TARGET_$(libvpx_2nd_arch)ARCH) += \
-    $(libvpx_intermediates)/vp8/common \
-    $(libvpx_intermediates)/vp8/decoder \
-    $(libvpx_intermediates)/vp8/encoder \
-    $(libvpx_intermediates)/vpx_scale
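
For context, the removed rules worked in two stages: each *_asm_offsets.c file
was compiled with -S and -DINLINE_ASM so that structure member offsets showed up
in the compiler output as EQU lines, and a second rule then extracted those lines
and converted them for the GNU assembler. The second stage was roughly equivalent
to the following, with paths trimmed for illustration:

  $ grep '^[a-zA-Z0-9_]* EQU' vpx_scale_asm_offsets.intermediate \
      | tr -d '$#' \
      | perl libvpx/build/make/ads2gas.pl \
      > vpx_scale_asm_offsets.asm
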
diff --git a/libvpx.mk b/libvpx.mk
index af19650..f1f194f 100644
--- a/libvpx.mk
+++ b/libvpx.mk
@@ -29,14 +29,14 @@
 LOCAL_SRC_FILES_$(TARGET_ARCH) := $(libvpx_codec_srcs_c_$(TARGET_ARCH))
 LOCAL_C_INCLUDES_$(TARGET_ARCH) := $(libvpx_config_dir_$(TARGET_ARCH))
 libvpx_2nd_arch :=
-include $(LOCAL_PATH)/libvpx-offsets.mk
+include $(LOCAL_PATH)/libvpx-asm-translation.mk
 
 ifdef TARGET_2ND_ARCH
 include $(LOCAL_PATH)/config.$(TARGET_2ND_ARCH).mk
 LOCAL_SRC_FILES_$(TARGET_2ND_ARCH) := $(libvpx_codec_srcs_c_$(TARGET_2ND_ARCH))
 LOCAL_C_INCLUDES_$(TARGET_2ND_ARCH) := $(libvpx_config_dir_$(TARGET_2ND_ARCH))
 libvpx_2nd_arch := $(TARGET_2ND_ARCH_VAR_PREFIX)
-include $(LOCAL_PATH)/libvpx-offsets.mk
+include $(LOCAL_PATH)/libvpx-asm-translation.mk
 libvpx_2nd_arch :=
 endif
 
diff --git a/libvpx/.mailmap b/libvpx/.mailmap
index fb82a24..0bfda12 100644
--- a/libvpx/.mailmap
+++ b/libvpx/.mailmap
@@ -1,18 +1,26 @@
 Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS
index a9aa481..2f63d7c 100644
--- a/libvpx/AUTHORS
+++ b/libvpx/AUTHORS
@@ -3,10 +3,11 @@
 
 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
 chm <chm@rock-chips.com>
 Christian Duvivier <cduvivier@google.com>
 Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@
 Ralph Giles <giles@xiph.org>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
 Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
 Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index 97c9a7b..b0d3064 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,31 @@
+xxxx-yy-zz v1.4.0 "Changes for next release"
+  vpxenc is changed to use VP9 by default.
+  Encoder controls added for 1 pass SVC.
+  Decoder control to toggle on/off loopfilter.
+
+2015-04-03 v1.4.0 "Indian Runner Duck"
+  This release includes significant improvements to the VP9 codec.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.3.0. It drops the compatibility
+    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+    controls for VP9.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Multithreaded VP9 decoding (tile and frame-based)
+    Multithreaded VP9 encoding - on by default
+    YUV 4:2:2 and 4:4:4 support in VP9
+    10 and 12bit support in VP9
+    64bit ARM support by replacing ARM assembly with intrinsics
+
+  - Bug Fixes:
+    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+    files.
+
+  - Known Issues:
+    Frame Parallel decoding fails for segmented and non-420 files.
+
 2013-11-15 v1.3.0 "Forest"
   This release introduces the VP9 codec in a backward-compatible way.
   All existing users of VP8 can continue to use the library without
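
As the release notes above state, vpxenc selects VP9 by default starting with
v1.4.0, so producing VP8 output now requires an explicit codec switch. A
minimal sketch, assuming a raw 640x480 input named input.yuv:

$ vpxenc --codec=vp8 -w 640 -h 480 -o out_vp8.webm input.yuv
$ vpxenc -w 640 -h 480 -o out_vp9.webm input.yuv    # VP9 is now the default
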
diff --git a/libvpx/PATENTS b/libvpx/PATENTS
index 79d17d7..caedf60 100644
--- a/libvpx/PATENTS
+++ b/libvpx/PATENTS
@@ -17,7 +17,7 @@
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
diff --git a/libvpx/README b/libvpx/README
index f9c24ff..979440e 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -47,10 +47,6 @@
   --help output of the configure script. As of this writing, the list of
   available targets is:
 
-    armv5te-android-gcc
-    armv5te-linux-rvct
-    armv5te-linux-gcc
-    armv5te-none-rvct
     armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
@@ -63,14 +59,10 @@
     armv7-none-rvct
     armv7-win32-vs11
     armv7-win32-vs12
+    armv7-win32-vs14
     armv7s-darwin-gcc
     mips32-linux-gcc
-    ppc32-darwin8-gcc
-    ppc32-darwin9-gcc
-    ppc32-linux-gcc
-    ppc64-darwin8-gcc
-    ppc64-darwin9-gcc
-    ppc64-linux-gcc
+    mips64-linux-gcc
     sparc-solaris-gcc
     x86-android-gcc
     x86-darwin8-gcc
@@ -81,6 +73,7 @@
     x86-darwin11-gcc
     x86-darwin12-gcc
     x86-darwin13-gcc
+    x86-darwin14-gcc
     x86-iphonesimulator-gcc
     x86-linux-gcc
     x86-linux-icc
@@ -93,11 +86,14 @@
     x86-win32-vs10
     x86-win32-vs11
     x86-win32-vs12
+    x86-win32-vs14
+    x86_64-android-gcc
     x86_64-darwin9-gcc
     x86_64-darwin10-gcc
     x86_64-darwin11-gcc
     x86_64-darwin12-gcc
     x86_64-darwin13-gcc
+    x86_64-darwin14-gcc
     x86_64-iphonesimulator-gcc
     x86_64-linux-gcc
     x86_64-linux-icc
@@ -108,12 +104,7 @@
     x86_64-win64-vs10
     x86_64-win64-vs11
     x86_64-win64-vs12
-    universal-darwin8-gcc
-    universal-darwin9-gcc
-    universal-darwin10-gcc
-    universal-darwin11-gcc
-    universal-darwin12-gcc
-    universal-darwin13-gcc
+    x86_64-win64-vs14
     generic-gnu
 
   The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/libvpx/args.c b/libvpx/args.c
index 9dabc9b..14b0310 100644
--- a/libvpx/args.c
+++ b/libvpx/args.c
@@ -14,9 +14,7 @@
 #include <limits.h>
 #include "args.h"
 
-#ifdef _MSC_VER
-#define snprintf _snprintf
-#endif
+#include "vpx_ports/msvc.h"
 
 #if defined(__GNUC__) && __GNUC__
 extern void die(const char *fmt, ...) __attribute__((noreturn));
diff --git a/libvpx/args.h b/libvpx/args.h
index 04e0acd..1f37151 100644
--- a/libvpx/args.h
+++ b/libvpx/args.h
@@ -51,6 +51,7 @@
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
 int arg_parse_enum_or_int(const struct arg *arg);
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/build/arm-msvs/obj_int_extract.bat b/libvpx/build/arm-msvs/obj_int_extract.bat
deleted file mode 100644
index c0987bc..0000000
--- a/libvpx/build/arm-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,18 +0,0 @@
-REM   Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-REM
-REM   Use of this source code is governed by a BSD-style license
-REM   that can be found in the LICENSE file in the root of the source
-REM   tree. An additional intellectual property rights grant can be found
-REM   in the file PATENTS.  All contributing project authors may
-REM   be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM   %1 - Relative path to the directory containing the vp8 and vpx_scale
-REM        source directories.
-REM   %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vpx_scale/vpx_scale_asm_offsets.c"
-%2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/libvpx/build/make/Android.mk b/libvpx/build/make/Android.mk
index 816334e..df01dec 100644
--- a/libvpx/build/make/Android.mk
+++ b/libvpx/build/make/Android.mk
@@ -43,7 +43,7 @@
 # will remove any NEON dependency.
 
 # To change to building armeabi, run ./libvpx/configure again, but with
-# --target=arm5te-android-gcc and modify the Application.mk file to
+# --target=armv6-android-gcc and modify the Application.mk file to
 # set APP_ABI := armeabi
 #
 # Running ndk-build will build libvpx and include it in your project.
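
The comment above describes the armeabi fallback: reconfigure for armv6 and
tell the NDK to build that ABI. A minimal sketch, assuming the usual NDK
project layout (APP_ABI may equally be set in Application.mk as noted above):

$ ./libvpx/configure --target=armv6-android-gcc --sdk-path=$ANDROID_NDK_ROOT \
    --disable-examples --disable-docs
$ ndk-build APP_ABI=armeabi
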
@@ -60,13 +60,15 @@
   include $(CONFIG_DIR)libs-armv7-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq  ($(TARGET_ARCH_ABI),armeabi)
-  include $(CONFIG_DIR)libs-armv5te-android-gcc.mk
+  include $(CONFIG_DIR)libs-armv6-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a)
   include $(CONFIG_DIR)libs-armv8-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),x86)
   include $(CONFIG_DIR)libs-x86-android-gcc.mk
+else ifeq ($(TARGET_ARCH_ABI),x86_64)
+  include $(CONFIG_DIR)libs-x86_64-android-gcc.mk
 else ifeq ($(TARGET_ARCH_ABI),mips)
   include $(CONFIG_DIR)libs-mips-android-gcc.mk
 else
@@ -91,51 +93,8 @@
 # like x86inc.asm and x86_abi_support.asm
 LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
 
-# -----------------------------------------------------------------------------
-# Template  : asm_offsets_template
-# Arguments : 1: assembly offsets file to be created
-#             2: c file to base assembly offsets on
-# Returns   : None
-# Usage     : $(eval $(call asm_offsets_template,<asmfile>, <srcfile>
-# Rationale : Create offsets at compile time using for structures that are
-#             defined in c, but used in assembly functions.
-# -----------------------------------------------------------------------------
-define asm_offsets_template
-
-_SRC:=$(2)
-_OBJ:=$(ASM_CNV_PATH)/$$(notdir $(2)).S
-
-_FLAGS = $$($$(my)CFLAGS) \
-          $$(call get-src-file-target-cflags,$(2)) \
-          $$(call host-c-includes,$$(LOCAL_C_INCLUDES) $$(CONFIG_DIR)) \
-          $$(LOCAL_CFLAGS) \
-          $$(NDK_APP_CFLAGS) \
-          $$(call host-c-includes,$$($(my)C_INCLUDES)) \
-          -DINLINE_ASM \
-          -S \
-
-_TEXT = "Compile $$(call get-src-file-text,$(2))"
-_CC   = $$(TARGET_CC)
-
-$$(eval $$(call ev-build-file))
-
-$(1) : $$(_OBJ) $(2)
-	@mkdir -p $$(dir $$@)
-	@grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@
-endef
-
-# Use ads2gas script to convert from RVCT format to GAS format.  This
-#  puts the processed file under $(ASM_CNV_PATH).  Local clean rule
-#  to handle removing these
-ifeq ($(CONFIG_VP8_ENCODER), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
-endif
-ifeq ($(HAVE_NEON_ASM), yes)
-  ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm
-endif
-
 .PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
+$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
 	@mkdir -p $(dir $@)
 	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
 
@@ -201,45 +160,44 @@
 
 LOCAL_MODULE := libvpx
 
-LOCAL_LDLIBS := -llog
-
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures
 endif
 
 # Add a dependency to force generation of the RTCD files.
+define rtcd_dep_template
+rtcd_dep_template_SRCS := $(addprefix $(LOCAL_PATH)/, $(LOCAL_SRC_FILES))
+rtcd_dep_template_SRCS := $$(rtcd_dep_template_SRCS:.neon=)
 ifeq ($(CONFIG_VP8), yes)
-$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp8_rtcd.h
+$$(rtcd_dep_template_SRCS): vp8_rtcd.h
 endif
 ifeq ($(CONFIG_VP9), yes)
-$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h
+$$(rtcd_dep_template_SRCS): vp9_rtcd.h
 endif
-$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
+ifeq ($(CONFIG_VP10), yes)
+$$(rtcd_dep_template_SRCS): vp10_rtcd.h
+endif
+$$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
+$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
 
-ifeq ($(TARGET_ARCH_ABI),x86)
-$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
+ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+$$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
+endef
+
+$(eval $(call rtcd_dep_template))
 
 .PHONY: clean
 clean:
 	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
 	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
-	@$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
 	@$(RM) -r $(ASM_CNV_PATH)
 	@$(RM) $(CLEAN-OBJS)
 
-include $(BUILD_SHARED_LIBRARY)
-
-ifeq ($(HAVE_NEON), yes)
-  $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm, \
-    $(LIBVPX_PATH)/vpx_scale/vpx_scale_asm_offsets.c))
-endif
-
-ifeq ($(CONFIG_VP8_ENCODER), yes)
-  $(eval $(call asm_offsets_template,\
-    $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \
-    $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c))
+ifeq ($(ENABLE_SHARED),1)
+  include $(BUILD_SHARED_LIBRARY)
+else
+  include $(BUILD_STATIC_LIBRARY)
 endif
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
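
With the change above, the library type is selected by the ENABLE_SHARED make
variable instead of always building a shared library. A sketch, assuming
ENABLE_SHARED is not already set by the generated libs-*-android-gcc.mk:

$ ndk-build                    # BUILD_STATIC_LIBRARY branch: libvpx.a
$ ndk-build ENABLE_SHARED=1    # BUILD_SHARED_LIBRARY branch: libvpx.so
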
diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile
index ed90397..f1b1cca 100644
--- a/libvpx/build/make/Makefile
+++ b/libvpx/build/make/Makefile
@@ -22,8 +22,10 @@
 exampletest: .DEFAULT
 install:: .DEFAULT
 test:: .DEFAULT
+test-no-data-check:: .DEFAULT
 testdata:: .DEFAULT
 utiltest: .DEFAULT
+exampletest-no-data-check utiltest-no-data-check: .DEFAULT
 
 
 # Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
@@ -56,13 +58,10 @@
         fi
 endif
 
+# Since we invoke make recursively for multiple targets we need to include the
+# .mk file for the correct target, but only when $(target) is non-empty.
 ifneq ($(target),)
-# Normally, we want to build the filename from the target and the toolchain.
-# This disambiguates from the $(target).mk file that exists in the source tree.
-# However, the toolchain is part of the target in universal builds, so we
-# don't want to include TOOLCHAIN in that case. FAT_ARCHS is used to test
-# if we're in the universal case.
-include $(target)$(if $(FAT_ARCHS),,-$(TOOLCHAIN)).mk
+include $(target)-$(TOOLCHAIN).mk
 endif
 BUILD_ROOT?=.
 VPATH=$(SRC_PATH_BARE)
@@ -116,6 +115,9 @@
 testdata::
 .PHONY: utiltest
 utiltest:
+.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
+test-no-data-check::
+exampletest-no-data-check utiltest-no-data-check:
 
 # Add compiler flags for intrinsic files
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
@@ -146,6 +148,7 @@
 
 $(BUILD_PFX)%.c.o: %.c
 	$(if $(quiet),@echo "    [CC] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
 
 $(BUILD_PFX)%.cc.d: %.cc
@@ -155,6 +158,7 @@
 
 $(BUILD_PFX)%.cc.o: %.cc
 	$(if $(quiet),@echo "    [CXX] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
 
 $(BUILD_PFX)%.cpp.d: %.cpp
@@ -164,6 +168,7 @@
 
 $(BUILD_PFX)%.cpp.o: %.cpp
 	$(if $(quiet),@echo "    [CXX] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
 
 $(BUILD_PFX)%.asm.d: %.asm
@@ -174,6 +179,7 @@
 
 $(BUILD_PFX)%.asm.o: %.asm
 	$(if $(quiet),@echo "    [AS] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
 
 $(BUILD_PFX)%.s.d: %.s
@@ -184,12 +190,14 @@
 
 $(BUILD_PFX)%.s.o: %.s
 	$(if $(quiet),@echo "    [AS] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
 
 .PRECIOUS: %.c.S
 %.c.S: CFLAGS += -DINLINE_ASM
 $(BUILD_PFX)%.c.S: %.c
 	$(if $(quiet),@echo "    [GEN] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
 
 .PRECIOUS: %.asm.s
@@ -217,14 +225,6 @@
 endif
 
 #
-# Rule to extract assembly constants from C sources
-#
-obj_int_extract: build/make/obj_int_extract.c
-	$(if $(quiet),@echo "    [HOSTCC] $@")
-	$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
-CLEAN-OBJS += obj_int_extract
-
-#
 # Utility functions
 #
 pairmap=$(if $(strip $(2)),\
@@ -315,18 +315,15 @@
         $$(filter %.o,$$^) $$(extralibs)
 endef
 
-
-
-define lipo_lib_template
-$(1): $(addsuffix /$(1),$(FAT_ARCHS))
-	$(if $(quiet),@echo "    [LIPO] $$@")
-	$(qexec)libtool -static -o $$@ $$?
-endef
-
-define lipo_bin_template
-$(1): $(addsuffix /$(1),$(FAT_ARCHS))
-	$(if $(quiet),@echo "    [LIPO] $$@")
-	$(qexec)lipo -output $$@ -create $$?
+define dll_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+$(1):
+	$(if $(quiet),@echo "    [LD] $$@")
+	$(qexec)$$(LD) -Zdll $$(LDFLAGS) \
+        -o $$@ \
+        $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE)
 endef
 
 
@@ -340,9 +337,11 @@
 skip_deps := $(filter %clean,$(MAKECMDGOALS))
 skip_deps += $(findstring testdata,$(MAKECMDGOALS))
 ifeq ($(strip $(skip_deps)),)
-  # Older versions of make don't like -include directives with no arguments
-  ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
-    -include $(filter %.d,$(OBJS-yes:.o=.d))
+  ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
+    # Older versions of make don't like -include directives with no arguments
+    ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
+      -include $(filter %.d,$(OBJS-yes:.o=.d))
+    endif
   endif
 endif
 
@@ -383,8 +382,9 @@
 .libs: $(LIBS)
 	@touch $@
 $(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib))))
 
 INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
 ifeq ($(MAKECMDGOALS),dist)
@@ -424,11 +424,7 @@
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/obj_int_extract.bat
-    DIST-SRCS-$(CONFIG_MSVS)  += build/arm-msvs/obj_int_extract.bat
     DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    # Include obj_int_extract if we use offsets from *_asm_*_offsets
-    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2armasm_ms.pl
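
Taken together, the CONFIG_DEPENDENCY_TRACKING changes above let a one-shot
build skip .d files entirely: each object rule now creates its own output
directory when tracking is off, and the -include of .d files is guarded so it
only runs when tracking is on. A sketch using the new configure toggle (the
target name is illustrative):

$ ./configure --target=x86_64-linux-gcc --disable-dependency-tracking
$ make
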
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
old mode 100644
new mode 100755
index ab6687f..688fa12
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -14,62 +14,56 @@
 # Logging / Output Functions
 #
 die_unknown(){
-    echo "Unknown option \"$1\"."
-    echo "See $0 --help for available options."
-    clean_temp_files
-    exit 1
+  echo "Unknown option \"$1\"."
+  echo "See $0 --help for available options."
+  clean_temp_files
+  exit 1
 }
 
-
 die() {
-    echo "$@"
-    echo
-    echo "Configuration failed. This could reflect a misconfiguration of your"
-    echo "toolchains, improper options selected, or another problem. If you"
-    echo "don't see any useful error messages above, the next step is to look"
-    echo "at the configure error log file ($logfile) to determine what"
-    echo "configure was trying to do when it died."
-    clean_temp_files
-    exit 1
+  echo "$@"
+  echo
+  echo "Configuration failed. This could reflect a misconfiguration of your"
+  echo "toolchains, improper options selected, or another problem. If you"
+  echo "don't see any useful error messages above, the next step is to look"
+  echo "at the configure error log file ($logfile) to determine what"
+  echo "configure was trying to do when it died."
+  clean_temp_files
+  exit 1
 }
 
-
 log(){
-    echo "$@" >>$logfile
+  echo "$@" >>$logfile
 }
 
-
 log_file(){
-    log BEGIN $1
-    cat -n $1 >>$logfile
-    log END $1
+  log BEGIN $1
+  cat -n $1 >>$logfile
+  log END $1
 }
 
-
 log_echo() {
-    echo "$@"
-    log "$@"
+  echo "$@"
+  log "$@"
 }
 
-
 fwrite () {
-    outfile=$1
-    shift
-    echo "$@" >> ${outfile}
+  outfile=$1
+  shift
+  echo "$@" >> ${outfile}
 }
 
-
 show_help_pre(){
-    for opt in ${CMDLINE_SELECT}; do
-        opt2=`echo $opt | sed -e 's;_;-;g'`
-        if enabled $opt; then
-            eval "toggle_${opt}=\"--disable-${opt2}\""
-        else
-            eval "toggle_${opt}=\"--enable-${opt2} \""
-        fi
-    done
+  for opt in ${CMDLINE_SELECT}; do
+    opt2=`echo $opt | sed -e 's;_;-;g'`
+    if enabled $opt; then
+      eval "toggle_${opt}=\"--disable-${opt2}\""
+    else
+      eval "toggle_${opt}=\"--enable-${opt2} \""
+    fi
+  done
 
-    cat <<EOF
+  cat <<EOF
 Usage: configure [options]
 Options:
 
@@ -89,6 +83,8 @@
   ${toggle_gprof}             enable/disable gprof profiling instrumentation
   ${toggle_gcov}              enable/disable gcov coverage instrumentation
   ${toggle_thumb}             enable/disable building arm assembly in thumb mode
+  ${toggle_dependency_tracking}
+                              disable to speed up one-time build
 
 Install options:
   ${toggle_install_docs}      control whether docs are installed
@@ -100,9 +96,8 @@
 EOF
 }
 
-
 show_help_post(){
-    cat <<EOF
+  cat <<EOF
 
 
 NOTES:
@@ -119,150 +114,137 @@
   exit 1
 }
 
-
 show_targets() {
-    while [ -n "$*" ]; do
-        if [ "${1%%-*}" = "${2%%-*}" ]; then
-            if [ "${2%%-*}" = "${3%%-*}" ]; then
-                printf "    %-24s %-24s %-24s\n" "$1" "$2" "$3"
-                shift; shift; shift
-            else
-                printf "    %-24s %-24s\n" "$1" "$2"
-                shift; shift
-            fi
-        else
-            printf "    %-24s\n" "$1"
-            shift
-        fi
-    done
+  while [ -n "$*" ]; do
+    if [ "${1%%-*}" = "${2%%-*}" ]; then
+      if [ "${2%%-*}" = "${3%%-*}" ]; then
+        printf "    %-24s %-24s %-24s\n" "$1" "$2" "$3"
+        shift; shift; shift
+      else
+        printf "    %-24s %-24s\n" "$1" "$2"
+        shift; shift
+      fi
+    else
+      printf "    %-24s\n" "$1"
+      shift
+    fi
+  done
 }
 
-
 show_help() {
-    show_help_pre
-    show_help_post
+  show_help_pre
+  show_help_post
 }
 
 #
 # List Processing Functions
 #
 set_all(){
-    value=$1
-    shift
-    for var in $*; do
-        eval $var=$value
-    done
+  value=$1
+  shift
+  for var in $*; do
+    eval $var=$value
+  done
 }
 
-
 is_in(){
-    value=$1
-    shift
-    for var in $*; do
-        [ $var = $value ] && return 0
-    done
-    return 1
+  value=$1
+  shift
+  for var in $*; do
+    [ $var = $value ] && return 0
+  done
+  return 1
 }
 
-
 add_cflags() {
-    CFLAGS="${CFLAGS} $@"
-    CXXFLAGS="${CXXFLAGS} $@"
+  CFLAGS="${CFLAGS} $@"
+  CXXFLAGS="${CXXFLAGS} $@"
 }
 
-
 add_cflags_only() {
-    CFLAGS="${CFLAGS} $@"
+  CFLAGS="${CFLAGS} $@"
 }
 
-
 add_cxxflags_only() {
-    CXXFLAGS="${CXXFLAGS} $@"
+  CXXFLAGS="${CXXFLAGS} $@"
 }
 
-
 add_ldflags() {
-    LDFLAGS="${LDFLAGS} $@"
+  LDFLAGS="${LDFLAGS} $@"
 }
 
-
 add_asflags() {
-    ASFLAGS="${ASFLAGS} $@"
+  ASFLAGS="${ASFLAGS} $@"
 }
 
-
 add_extralibs() {
-    extralibs="${extralibs} $@"
+  extralibs="${extralibs} $@"
 }
 
 #
 # Boolean Manipulation Functions
 #
 enable_feature(){
-    set_all yes $*
+  set_all yes $*
 }
 
 disable_feature(){
-    set_all no $*
+  set_all no $*
 }
 
 enabled(){
-    eval test "x\$$1" = "xyes"
+  eval test "x\$$1" = "xyes"
 }
 
 disabled(){
-    eval test "x\$$1" = "xno"
+  eval test "x\$$1" = "xno"
 }
 
-
 soft_enable() {
-    for var in $*; do
-        if ! disabled $var; then
-            log_echo "  enabling $var"
-            enable_feature $var
-        fi
-    done
+  for var in $*; do
+    if ! disabled $var; then
+      enabled $var || log_echo "  enabling $var"
+      enable_feature $var
+    fi
+  done
 }
 
 soft_disable() {
-    for var in $*; do
-        if ! enabled $var; then
-            log_echo "  disabling $var"
-            disable_feature $var
-        fi
-    done
+  for var in $*; do
+    if ! enabled $var; then
+      disabled $var || log_echo "  disabling $var"
+      disable_feature $var
+    fi
+  done
 }
 
-
 #
 # Text Processing Functions
 #
 toupper(){
-    echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
+  echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
 }
 
-
 tolower(){
-    echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
+  echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
 }
 
-
 #
 # Temporary File Functions
 #
 source_path=${0%/*}
 enable_feature source_path_used
 if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
-    source_path="`pwd`"
-    disable_feature source_path_used
+  source_path="`pwd`"
+  disable_feature source_path_used
 fi
 
 if test ! -z "$TMPDIR" ; then
-    TMPDIRx="${TMPDIR}"
+  TMPDIRx="${TMPDIR}"
 elif test ! -z "$TEMPDIR" ; then
-    TMPDIRx="${TEMPDIR}"
+  TMPDIRx="${TEMPDIR}"
 else
-    TMPDIRx="/tmp"
+  TMPDIRx="/tmp"
 fi
 RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
 TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
@@ -273,76 +255,77 @@
 TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
 
 clean_temp_files() {
-    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
-    enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
+  rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+  enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
 }
 
 #
 # Toolchain Check Functions
 #
 check_cmd() {
-    enabled external_build && return
-    log "$@"
-    "$@" >>${logfile} 2>&1
+  enabled external_build && return
+  log "$@"
+  "$@" >>${logfile} 2>&1
 }
 
 check_cc() {
-    log check_cc "$@"
-    cat >${TMP_C}
-    log_file ${TMP_C}
-    check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+  log check_cc "$@"
+  cat >${TMP_C}
+  log_file ${TMP_C}
+  check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
 }
 
 check_cxx() {
-    log check_cxx "$@"
-    cat >${TMP_CC}
-    log_file ${TMP_CC}
-    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
+  log check_cxx "$@"
+  cat >${TMP_CC}
+  log_file ${TMP_CC}
+  check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
 }
 
 check_cpp() {
-    log check_cpp "$@"
-    cat > ${TMP_C}
-    log_file ${TMP_C}
-    check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
+  log check_cpp "$@"
+  cat > ${TMP_C}
+  log_file ${TMP_C}
+  check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
 }
 
 check_ld() {
-    log check_ld "$@"
-    check_cc $@ \
-        && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
+  log check_ld "$@"
+  check_cc $@ \
+    && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
 }
 
 check_header(){
-    log check_header "$@"
-    header=$1
-    shift
-    var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
-    disable_feature $var
-    check_cpp "$@" <<EOF && enable_feature $var
+  log check_header "$@"
+  header=$1
+  shift
+  var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
+  disable_feature $var
+  check_cpp "$@" <<EOF && enable_feature $var
 #include "$header"
 int x;
 EOF
 }
 
-
 check_cflags() {
-    log check_cflags "$@"
-    check_cc -Werror "$@" <<EOF
+  log check_cflags "$@"
+  check_cc -Werror "$@" <<EOF
 int x;
 EOF
 }
 
 check_cxxflags() {
-    log check_cxxflags "$@"
+  log check_cxxflags "$@"
 
-    # Catch CFLAGS that trigger CXX warnings
-    case "$CXX" in
-      *c++-analyzer|*clang++|*g++*) check_cxx -Werror "$@" <<EOF
+  # Catch CFLAGS that trigger CXX warnings
+  case "$CXX" in
+    *c++-analyzer|*clang++|*g++*)
+      check_cxx -Werror "$@" <<EOF
 int x;
 EOF
       ;;
-      *) check_cxx -Werror "$@" <<EOF
+    *)
+      check_cxx -Werror "$@" <<EOF
 int x;
 EOF
       ;;
@@ -350,82 +333,82 @@
 }
 
 check_add_cflags() {
-    check_cxxflags "$@" && add_cxxflags_only "$@"
-    check_cflags "$@" && add_cflags_only "$@"
+  check_cxxflags "$@" && add_cxxflags_only "$@"
+  check_cflags "$@" && add_cflags_only "$@"
 }
 
 check_add_asflags() {
-    log add_asflags "$@"
-    add_asflags "$@"
+  log add_asflags "$@"
+  add_asflags "$@"
 }
 
 check_add_ldflags() {
-    log add_ldflags "$@"
-    add_ldflags "$@"
+  log add_ldflags "$@"
+  add_ldflags "$@"
 }
 
 check_asm_align() {
-    log check_asm_align "$@"
-    cat >${TMP_ASM} <<EOF
+  log check_asm_align "$@"
+  cat >${TMP_ASM} <<EOF
 section .rodata
 align 16
 EOF
-    log_file ${TMP_ASM}
-    check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
-    readelf -WS ${TMP_O} >${TMP_X}
-    log_file ${TMP_X}
-    if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
-        die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
-    fi
+  log_file ${TMP_ASM}
+  check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
+  readelf -WS ${TMP_O} >${TMP_X}
+  log_file ${TMP_X}
+  if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
+    die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
+  fi
 }
 
 # tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
 check_gcc_machine_option() {
-    opt="$1"
-    feature="$2"
-    [ -n "$feature" ] || feature="$opt"
+  opt="$1"
+  feature="$2"
+  [ -n "$feature" ] || feature="$opt"
 
-    if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
-        RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
-    else
-        soft_enable "$feature"
-    fi
+  if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
+    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+  else
+    soft_enable "$feature"
+  fi
 }
 
 write_common_config_banner() {
-    print_webm_license config.mk "##" ""
-    echo '# This file automatically generated by configure. Do not edit!' >> config.mk
-    echo "TOOLCHAIN := ${toolchain}" >> config.mk
+  print_webm_license config.mk "##" ""
+  echo '# This file automatically generated by configure. Do not edit!' >> config.mk
+  echo "TOOLCHAIN := ${toolchain}" >> config.mk
 
-    case ${toolchain} in
-        *-linux-rvct)
-            echo "ALT_LIBC := ${alt_libc}" >> config.mk
-            ;;
-    esac
+  case ${toolchain} in
+    *-linux-rvct)
+      echo "ALT_LIBC := ${alt_libc}" >> config.mk
+      ;;
+  esac
 }
 
 write_common_config_targets() {
-    for t in ${all_targets}; do
-        if enabled ${t}; then
-            if enabled universal || enabled child; then
-                fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
-            else
-                fwrite config.mk "ALL_TARGETS += ${t}"
-            fi
-        fi
+  for t in ${all_targets}; do
+    if enabled ${t}; then
+      if enabled child; then
+        fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
+      else
+        fwrite config.mk "ALL_TARGETS += ${t}"
+      fi
+    fi
     true;
-    done
-true
+  done
+  true
 }
 
 write_common_target_config_mk() {
-    saved_CC="${CC}"
-    saved_CXX="${CXX}"
-    enabled ccache && CC="ccache ${CC}"
-    enabled ccache && CXX="ccache ${CXX}"
-    print_webm_license $1 "##" ""
+  saved_CC="${CC}"
+  saved_CXX="${CXX}"
+  enabled ccache && CC="ccache ${CC}"
+  enabled ccache && CXX="ccache ${CXX}"
+  print_webm_license $1 "##" ""
 
-    cat >> $1 << EOF
+  cat >> $1 << EOF
 # This file automatically generated by configure. Do not edit!
 SRC_PATH="$source_path"
 SRC_PATH_BARE=$source_path
@@ -455,83 +438,87 @@
 RTCD_OPTIONS = ${RTCD_OPTIONS}
 EOF
 
-    if enabled rvct; then cat >> $1 << EOF
+  if enabled rvct; then cat >> $1 << EOF
 fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
 EOF
-    else cat >> $1 << EOF
+  else cat >> $1 << EOF
 fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
 EOF
-    fi
+  fi
 
-    print_config_mk ARCH   "${1}" ${ARCH_LIST}
-    print_config_mk HAVE   "${1}" ${HAVE_LIST}
-    print_config_mk CONFIG "${1}" ${CONFIG_LIST}
-    print_config_mk HAVE   "${1}" gnu_strip
+  print_config_mk ARCH   "${1}" ${ARCH_LIST}
+  print_config_mk HAVE   "${1}" ${HAVE_LIST}
+  print_config_mk CONFIG "${1}" ${CONFIG_LIST}
+  print_config_mk HAVE   "${1}" gnu_strip
 
-    enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+  enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
 
-    CC="${saved_CC}"
-    CXX="${saved_CXX}"
+  CC="${saved_CC}"
+  CXX="${saved_CXX}"
 }
 
-
 write_common_target_config_h() {
-    print_webm_license ${TMP_H} "/*" " */"
-    cat >> ${TMP_H} << EOF
+  print_webm_license ${TMP_H} "/*" " */"
+  cat >> ${TMP_H} << EOF
 /* This file automatically generated by configure. Do not edit! */
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    ${RESTRICT}
 #define INLINE      ${INLINE}
 EOF
-    print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
-    print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
-    print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
-    print_config_vars_h   "${TMP_H}" ${VAR_LIST}
-    echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
-    mkdir -p `dirname "$1"`
-    cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
+  print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
+  print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
+  print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
+  print_config_vars_h   "${TMP_H}" ${VAR_LIST}
+  echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
+  mkdir -p `dirname "$1"`
+  cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
 }
 
 process_common_cmdline() {
-    for opt in "$@"; do
-        optval="${opt#*=}"
-        case "$opt" in
-        --child) enable_feature child
+  for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+      --child)
+        enable_feature child
         ;;
-        --log*)
+      --log*)
         logging="$optval"
         if ! disabled logging ; then
-            enabled logging || logfile="$logging"
+          enabled logging || logfile="$logging"
         else
-            logfile=/dev/null
+          logfile=/dev/null
         fi
         ;;
-        --target=*) toolchain="${toolchain:-${optval}}"
+      --target=*)
+        toolchain="${toolchain:-${optval}}"
         ;;
-        --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
+      --force-target=*)
+        toolchain="${toolchain:-${optval}}"
+        enable_feature force_toolchain
         ;;
-        --cpu)
+      --cpu=*)
+        tune_cpu="$optval"
         ;;
-        --cpu=*) tune_cpu="$optval"
-        ;;
-        --extra-cflags=*)
+      --extra-cflags=*)
         extra_cflags="${optval}"
         ;;
-        --enable-?*|--disable-?*)
+      --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
-            [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
+          [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
         elif [ $action = "disable" ] && ! disabled $option ; then
           echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
             die_unknown $opt
+          log_echo "  disabling $option"
         elif [ $action = "enable" ] && ! enabled $option ; then
           echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
             die_unknown $opt
+          log_echo "  enabling $option"
         fi
         ${action}_feature $option
         ;;
-        --require-?*)
+      --require-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
             RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
@@ -539,22 +526,22 @@
             die_unknown $opt
         fi
         ;;
-        --force-enable-?*|--force-disable-?*)
+      --force-enable-?*|--force-disable-?*)
         eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
         ${action}_feature $option
         ;;
-        --libc=*)
+      --libc=*)
         [ -d "${optval}" ] || die "Not a directory: ${optval}"
         disable_feature builtin_libc
         alt_libc="${optval}"
         ;;
-        --as=*)
+      --as=*)
         [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
-            || [ "${optval}" = auto ] \
-            || die "Must be yasm, nasm or auto: ${optval}"
+          || [ "${optval}" = auto ] \
+          || die "Must be yasm, nasm or auto: ${optval}"
         alt_as="${optval}"
         ;;
-        --size-limit=*)
+      --size-limit=*)
         w="${optval%%x*}"
         h="${optval##*x}"
         VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
@@ -563,806 +550,838 @@
             || die "Invalid size-limit: too big."
         enable_feature size_limit
         ;;
-        --prefix=*)
+      --prefix=*)
         prefix="${optval}"
         ;;
-        --libdir=*)
+      --libdir=*)
         libdir="${optval}"
         ;;
-        --sdk-path=*)
+      --sdk-path=*)
         [ -d "${optval}" ] || die "Not a directory: ${optval}"
         sdk_path="${optval}"
         ;;
-        --libc|--as|--prefix|--libdir|--sdk-path)
+      --libc|--as|--prefix|--libdir|--sdk-path)
         die "Option ${opt} requires argument"
         ;;
-        --help|-h) show_help
+      --help|-h)
+        show_help
         ;;
-        *) die_unknown $opt
+      *)
+        die_unknown $opt
         ;;
-        esac
-    done
+    esac
+  done
 }
 
 process_cmdline() {
-    for opt do
-        optval="${opt#*=}"
-        case "$opt" in
-        *) process_common_cmdline $opt
+  for opt do
+    optval="${opt#*=}"
+    case "$opt" in
+      *)
+        process_common_cmdline $opt
         ;;
-        esac
-    done
+    esac
+  done
 }
 
-
 post_process_common_cmdline() {
-    prefix="${prefix:-/usr/local}"
-    prefix="${prefix%/}"
-    libdir="${libdir:-${prefix}/lib}"
-    libdir="${libdir%/}"
-    if [ "${libdir#${prefix}}" = "${libdir}" ]; then
-        die "Libdir ${libdir} must be a subdirectory of ${prefix}"
-    fi
+  prefix="${prefix:-/usr/local}"
+  prefix="${prefix%/}"
+  libdir="${libdir:-${prefix}/lib}"
+  libdir="${libdir%/}"
+  if [ "${libdir#${prefix}}" = "${libdir}" ]; then
+    die "Libdir ${libdir} must be a subdirectory of ${prefix}"
+  fi
 }
 
-
 post_process_cmdline() {
-    true;
+  true;
 }
 
 setup_gnu_toolchain() {
-        CC=${CC:-${CROSS}gcc}
-        CXX=${CXX:-${CROSS}g++}
-        AR=${AR:-${CROSS}ar}
-        LD=${LD:-${CROSS}${link_with_cc:-ld}}
-        AS=${AS:-${CROSS}as}
-    STRIP=${STRIP:-${CROSS}strip}
-    NM=${NM:-${CROSS}nm}
-        AS_SFX=.s
-        EXE_SFX=
+  CC=${CC:-${CROSS}gcc}
+  CXX=${CXX:-${CROSS}g++}
+  AR=${AR:-${CROSS}ar}
+  LD=${LD:-${CROSS}${link_with_cc:-ld}}
+  AS=${AS:-${CROSS}as}
+  STRIP=${STRIP:-${CROSS}strip}
+  NM=${NM:-${CROSS}nm}
+  AS_SFX=.s
+  EXE_SFX=
+}
+
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+  xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+    xcodebuild -sdk $1 -version Path 2>/dev/null
 }
 
 process_common_toolchain() {
-    if [ -z "$toolchain" ]; then
-        gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
+  if [ -z "$toolchain" ]; then
+    gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
 
-        # detect tgt_isa
-        case "$gcctarget" in
-            armv6*)
-                tgt_isa=armv6
-                ;;
-            armv7*-hardfloat*)
-                tgt_isa=armv7
-                float_abi=hard
-                ;;
-            armv7*)
-                tgt_isa=armv7
-                float_abi=softfp
-                ;;
-            armv5te*)
-                tgt_isa=armv5te
-                ;;
-            *x86_64*|*amd64*)
-                tgt_isa=x86_64
-                ;;
-            *i[3456]86*)
-                tgt_isa=x86
-                ;;
-            *powerpc64*)
-                tgt_isa=ppc64
-                ;;
-            *powerpc*)
-                tgt_isa=ppc32
-                ;;
-            *sparc*)
-                tgt_isa=sparc
-                ;;
-        esac
+    # detect tgt_isa
+    case "$gcctarget" in
+      armv6*)
+        tgt_isa=armv6
+        ;;
+      armv7*-hardfloat*)
+        tgt_isa=armv7
+        float_abi=hard
+        ;;
+      armv7*)
+        tgt_isa=armv7
+        float_abi=softfp
+        ;;
+      *x86_64*|*amd64*)
+        tgt_isa=x86_64
+        ;;
+      *i[3456]86*)
+        tgt_isa=x86
+        ;;
+      *sparc*)
+        tgt_isa=sparc
+        ;;
+    esac
 
-        # detect tgt_os
-        case "$gcctarget" in
-            *darwin8*)
-                tgt_isa=universal
-                tgt_os=darwin8
-                ;;
-            *darwin9*)
-                tgt_isa=universal
-                tgt_os=darwin9
-                ;;
-            *darwin10*)
-                tgt_isa=x86_64
-                tgt_os=darwin10
-                ;;
-            *darwin11*)
-                tgt_isa=x86_64
-                tgt_os=darwin11
-                ;;
-            *darwin12*)
-                tgt_isa=x86_64
-                tgt_os=darwin12
-                ;;
-            *darwin13*)
-                tgt_isa=x86_64
-                tgt_os=darwin13
-                ;;
-            x86_64*mingw32*)
-                tgt_os=win64
-                ;;
-            *mingw32*|*cygwin*)
-                [ -z "$tgt_isa" ] && tgt_isa=x86
-                tgt_os=win32
-                ;;
-            *linux*|*bsd*)
-                tgt_os=linux
-                ;;
-            *solaris2.10)
-                tgt_os=solaris
-                ;;
-            *os2*)
-                tgt_os=os2
-                ;;
-        esac
+    # detect tgt_os
+    case "$gcctarget" in
+      *darwin10*)
+        tgt_isa=x86_64
+        tgt_os=darwin10
+        ;;
+      *darwin11*)
+        tgt_isa=x86_64
+        tgt_os=darwin11
+        ;;
+      *darwin12*)
+        tgt_isa=x86_64
+        tgt_os=darwin12
+        ;;
+      *darwin13*)
+        tgt_isa=x86_64
+        tgt_os=darwin13
+        ;;
+      *darwin14*)
+        tgt_isa=x86_64
+        tgt_os=darwin14
+        ;;
+      x86_64*mingw32*)
+        tgt_os=win64
+        ;;
+      *mingw32*|*cygwin*)
+        [ -z "$tgt_isa" ] && tgt_isa=x86
+        tgt_os=win32
+        ;;
+      *linux*|*bsd*)
+        tgt_os=linux
+        ;;
+      *solaris2.10)
+        tgt_os=solaris
+        ;;
+      *os2*)
+        tgt_os=os2
+        ;;
+    esac
 
-        if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
-            toolchain=${tgt_isa}-${tgt_os}-gcc
-        fi
+    if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
+      toolchain=${tgt_isa}-${tgt_os}-gcc
     fi
+  fi
 
-    toolchain=${toolchain:-generic-gnu}
+  toolchain=${toolchain:-generic-gnu}
 
-    is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
-        || die "Unrecognized toolchain '${toolchain}'"
+  is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
+    || die "Unrecognized toolchain '${toolchain}'"
 
-    enabled child || log_echo "Configuring for target '${toolchain}'"
+  enabled child || log_echo "Configuring for target '${toolchain}'"
 
-    #
-    # Set up toolchain variables
-    #
-    tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
-    tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
-    tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
+  #
+  # Set up toolchain variables
+  #
+  tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
+  tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
+  tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
 
-    # Mark the specific ISA requested as enabled
-    soft_enable ${tgt_isa}
-    enable_feature ${tgt_os}
-    enable_feature ${tgt_cc}
+  # Mark the specific ISA requested as enabled
+  soft_enable ${tgt_isa}
+  enable_feature ${tgt_os}
+  enable_feature ${tgt_cc}
 
-    # Enable the architecture family
-    case ${tgt_isa} in
-        arm*) enable_feature arm;;
-        mips*) enable_feature mips;;
-    esac
+  # Enable the architecture family
+  case ${tgt_isa} in
+    arm*)
+      enable_feature arm
+      ;;
+    mips*)
+      enable_feature mips
+      ;;
+  esac
 
-    # PIC is probably what we want when building shared libs
-    enabled shared && soft_enable pic
+  # PIC is probably what we want when building shared libs
+  enabled shared && soft_enable pic
 
-    # Handle darwin variants. Newer SDKs allow targeting older
-    # platforms, so find the newest SDK available.
-    case ${toolchain} in
-        *-darwin*)
-            if [ -z "${DEVELOPER_DIR}" ]; then
-                DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
-                [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
-            fi
-            if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
-                OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
-                OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
-                OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
-                for v in ${OSX_SDK_VERSIONS}; do
-                    if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
-                        osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
-                    fi
-                done
-            fi
-            ;;
-    esac
+  # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+  IOS_VERSION_MIN="6.0"
 
-    if [ -d "${osx_sdk_dir}" ]; then
+  # Handle darwin variants. Newer SDKs allow targeting older
+  # platforms, so use the newest one available.
+  case ${toolchain} in
+    arm*-darwin*)
+      ios_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${ios_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${ios_sdk_dir}"
+        add_ldflags "-isysroot ${ios_sdk_dir}"
+      fi
+      ;;
+    *-darwin*)
+      osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+      if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
         add_ldflags "-isysroot ${osx_sdk_dir}"
-    fi
+      fi
+      ;;
+  esac
 
-    case ${toolchain} in
-        *-darwin8-*)
-            add_cflags  "-mmacosx-version-min=10.4"
-            add_ldflags "-mmacosx-version-min=10.4"
-            ;;
-        *-darwin9-*)
-            add_cflags  "-mmacosx-version-min=10.5"
-            add_ldflags "-mmacosx-version-min=10.5"
-            ;;
-        *-darwin10-*)
-            add_cflags  "-mmacosx-version-min=10.6"
-            add_ldflags "-mmacosx-version-min=10.6"
-            ;;
-        *-darwin11-*)
-            add_cflags  "-mmacosx-version-min=10.7"
-            add_ldflags "-mmacosx-version-min=10.7"
-            ;;
-        *-darwin12-*)
-            add_cflags  "-mmacosx-version-min=10.8"
-            add_ldflags "-mmacosx-version-min=10.8"
-            ;;
-        *-darwin13-*)
-            add_cflags  "-mmacosx-version-min=10.9"
-            add_ldflags "-mmacosx-version-min=10.9"
-            ;;
-        *-iphonesimulator-*)
-            add_cflags  "-miphoneos-version-min=5.0"
-            add_ldflags "-miphoneos-version-min=5.0"
-            osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
-            add_cflags  "-isysroot ${osx_sdk_dir}"
-            add_ldflags "-isysroot ${osx_sdk_dir}"
-            ;;
-    esac
+  case ${toolchain} in
+    *-darwin8-*)
+      add_cflags  "-mmacosx-version-min=10.4"
+      add_ldflags "-mmacosx-version-min=10.4"
+      ;;
+    *-darwin9-*)
+      add_cflags  "-mmacosx-version-min=10.5"
+      add_ldflags "-mmacosx-version-min=10.5"
+      ;;
+    *-darwin10-*)
+      add_cflags  "-mmacosx-version-min=10.6"
+      add_ldflags "-mmacosx-version-min=10.6"
+      ;;
+    *-darwin11-*)
+      add_cflags  "-mmacosx-version-min=10.7"
+      add_ldflags "-mmacosx-version-min=10.7"
+      ;;
+    *-darwin12-*)
+      add_cflags  "-mmacosx-version-min=10.8"
+      add_ldflags "-mmacosx-version-min=10.8"
+      ;;
+    *-darwin13-*)
+      add_cflags  "-mmacosx-version-min=10.9"
+      add_ldflags "-mmacosx-version-min=10.9"
+      ;;
+    *-darwin14-*)
+      add_cflags  "-mmacosx-version-min=10.10"
+      add_ldflags "-mmacosx-version-min=10.10"
+      ;;
+    *-iphonesimulator-*)
+      add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+      if [ -d "${iossim_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iossim_sdk_dir}"
+        add_ldflags "-isysroot ${iossim_sdk_dir}"
+      fi
+      ;;
+  esac
 
-    # Handle Solaris variants. Solaris 10 needs -lposix4
-    case ${toolchain} in
-        sparc-solaris-*)
-            add_extralibs -lposix4
-            disable_feature fast_unaligned
-            ;;
-        *-solaris-*)
-            add_extralibs -lposix4
-            ;;
-    esac
+  # Handle Solaris variants. Solaris 10 needs -lposix4
+  case ${toolchain} in
+    sparc-solaris-*)
+      add_extralibs -lposix4
+      ;;
+    *-solaris-*)
+      add_extralibs -lposix4
+      ;;
+  esac
 
-    # Process ARM architecture variants
-    case ${toolchain} in
+  # Process ARM architecture variants
+  case ${toolchain} in
     arm*)
-        # on arm, isa versions are supersets
-        case ${tgt_isa} in
+      # on arm, isa versions are supersets
+      case ${tgt_isa} in
         arm64|armv8)
-            soft_enable neon
-            ;;
+          soft_enable neon
+          ;;
         armv7|armv7s)
-            soft_enable neon
-            soft_enable neon_asm
-            soft_enable media
-            soft_enable edsp
-            soft_enable fast_unaligned
-            ;;
+          soft_enable neon
+          # Only enable neon_asm when neon is also enabled.
+          enabled neon && soft_enable neon_asm
+          # If someone tries to force it through, die.
+          if disabled neon && enabled neon_asm; then
+            die "Disabling neon while keeping neon-asm is not supported"
+          fi
+          case ${toolchain} in
+            *-darwin*)
+              # Neon is guaranteed on iOS 6+ devices, while old media extensions
+              # no longer assemble with iOS 9 SDK
+              ;;
+            *)
+              soft_enable media
+          esac
+          ;;
         armv6)
-            soft_enable media
-            soft_enable edsp
-            soft_enable fast_unaligned
-            ;;
-        armv5te)
-            soft_enable edsp
-            disable_feature fast_unaligned
-            ;;
-        esac
+          soft_enable media
+          ;;
+      esac
 
-        asm_conversion_cmd="cat"
+      asm_conversion_cmd="cat"
 
-        case ${tgt_cc} in
+      case ${tgt_cc} in
         gcc)
-            CROSS=${CROSS:-arm-none-linux-gnueabi-}
-            link_with_cc=gcc
-            setup_gnu_toolchain
-            arch_int=${tgt_isa##armv}
-            arch_int=${arch_int%%te}
-            check_add_asflags --defsym ARCHITECTURE=${arch_int}
-            tune_cflags="-mtune="
-            if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
-                if [ -z "${float_abi}" ]; then
-                    check_cpp <<EOF && float_abi=hard || float_abi=softfp
+          CROSS=${CROSS:-arm-none-linux-gnueabi-}
+          link_with_cc=gcc
+          setup_gnu_toolchain
+          arch_int=${tgt_isa##armv}
+          arch_int=${arch_int%%te}
+          check_add_asflags --defsym ARCHITECTURE=${arch_int}
+          tune_cflags="-mtune="
+          if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+            if [ -z "${float_abi}" ]; then
+              check_cpp <<EOF && float_abi=hard || float_abi=softfp
 #ifndef __ARM_PCS_VFP
 #error "not hardfp"
 #endif
 EOF
-                fi
-                check_add_cflags  -march=armv7-a -mfloat-abi=${float_abi}
-                check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
-
-                if enabled neon || enabled neon_asm
-                then
-                    check_add_cflags -mfpu=neon #-ftree-vectorize
-                    check_add_asflags -mfpu=neon
-                fi
-
-                if [ -z "${tune_cpu}" ]; then
-                    tune_cpu=cortex-a8
-                fi
-            else
-                check_add_cflags -march=${tgt_isa}
-                check_add_asflags -march=${tgt_isa}
             fi
+            check_add_cflags  -march=armv7-a -mfloat-abi=${float_abi}
+            check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
 
-            enabled debug && add_asflags -g
-            asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
-            if enabled thumb; then
-                asm_conversion_cmd="$asm_conversion_cmd -thumb"
-                check_add_cflags -mthumb
-                check_add_asflags -mthumb -mimplicit-it=always
+            if enabled neon || enabled neon_asm; then
+              check_add_cflags -mfpu=neon #-ftree-vectorize
+              check_add_asflags -mfpu=neon
             fi
-            ;;
+          else
+            check_add_cflags -march=${tgt_isa}
+            check_add_asflags -march=${tgt_isa}
+          fi
+
+          enabled debug && add_asflags -g
+          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+          if enabled thumb; then
+            asm_conversion_cmd="$asm_conversion_cmd -thumb"
+            check_add_cflags -mthumb
+            check_add_asflags -mthumb -mimplicit-it=always
+          fi
+          ;;
         vs*)
-            asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-            AS_SFX=.s
-            msvs_arch_dir=arm-msvs
-            disable_feature multithread
-            disable_feature unit_tests
-            vs_version=${tgt_cc##vs}
-            if [ $vs_version -ge 12 ]; then
-                # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
-                # only "AppContainerApplication" which requires an AppxManifest.
-                # Therefore disable the examples, just build the library.
-                disable_feature examples
-            fi
-            ;;
+          asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
+          AS_SFX=.s
+          msvs_arch_dir=arm-msvs
+          disable_feature multithread
+          disable_feature unit_tests
+          vs_version=${tgt_cc##vs}
+          if [ $vs_version -ge 12 ]; then
+            # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
+            # only "AppContainerApplication" which requires an AppxManifest.
+            # Therefore disable the examples, just build the library.
+            disable_feature examples
+          fi
+          ;;
         rvct)
-            CC=armcc
-            AR=armar
-            AS=armasm
-            LD="${source_path}/build/make/armlink_adapter.sh"
-            STRIP=arm-none-linux-gnueabi-strip
-            NM=arm-none-linux-gnueabi-nm
-            tune_cflags="--cpu="
-            tune_asflags="--cpu="
-            if [ -z "${tune_cpu}" ]; then
-                if [ ${tgt_isa} = "armv7" ]; then
-                    if enabled neon || enabled neon_asm
-                    then
-                        check_add_cflags --fpu=softvfp+vfpv3
-                        check_add_asflags --fpu=softvfp+vfpv3
-                    fi
-                    check_add_cflags --cpu=Cortex-A8
-                    check_add_asflags --cpu=Cortex-A8
-                else
-                    check_add_cflags --cpu=${tgt_isa##armv}
-                    check_add_asflags --cpu=${tgt_isa##armv}
-                fi
+          CC=armcc
+          AR=armar
+          AS=armasm
+          LD="${source_path}/build/make/armlink_adapter.sh"
+          STRIP=arm-none-linux-gnueabi-strip
+          NM=arm-none-linux-gnueabi-nm
+          tune_cflags="--cpu="
+          tune_asflags="--cpu="
+          if [ -z "${tune_cpu}" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
+              if enabled neon || enabled neon_asm
+              then
+                check_add_cflags --fpu=softvfp+vfpv3
+                check_add_asflags --fpu=softvfp+vfpv3
+              fi
+              check_add_cflags --cpu=Cortex-A8
+              check_add_asflags --cpu=Cortex-A8
+            else
+              check_add_cflags --cpu=${tgt_isa##armv}
+              check_add_asflags --cpu=${tgt_isa##armv}
             fi
-            arch_int=${tgt_isa##armv}
-            arch_int=${arch_int%%te}
-            check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
-            enabled debug && add_asflags -g
-            add_cflags --gnu
-            add_cflags --enum_is_int
-            add_cflags --wchar32
-        ;;
-        esac
+          fi
+          arch_int=${tgt_isa##armv}
+          arch_int=${arch_int%%te}
+          check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+          enabled debug && add_asflags -g
+          add_cflags --gnu
+          add_cflags --enum_is_int
+          add_cflags --wchar32
+          ;;
+      esac
 
-        case ${tgt_os} in
+      case ${tgt_os} in
         none*)
-            disable_feature multithread
-            disable_feature os_support
-            ;;
+          disable_feature multithread
+          disable_feature os_support
+          ;;
 
         android*)
-            SDK_PATH=${sdk_path}
-            COMPILER_LOCATION=`find "${SDK_PATH}" \
-                               -name "arm-linux-androideabi-gcc*" -print -quit`
-            TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
-            CC=${TOOLCHAIN_PATH}gcc
-            CXX=${TOOLCHAIN_PATH}g++
-            AR=${TOOLCHAIN_PATH}ar
-            LD=${TOOLCHAIN_PATH}gcc
-            AS=${TOOLCHAIN_PATH}as
-            STRIP=${TOOLCHAIN_PATH}strip
-            NM=${TOOLCHAIN_PATH}nm
+          SDK_PATH=${sdk_path}
+          COMPILER_LOCATION=`find "${SDK_PATH}" \
+                             -name "arm-linux-androideabi-gcc*" -print -quit`
+          TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
+          CC=${TOOLCHAIN_PATH}gcc
+          CXX=${TOOLCHAIN_PATH}g++
+          AR=${TOOLCHAIN_PATH}ar
+          LD=${TOOLCHAIN_PATH}gcc
+          AS=${TOOLCHAIN_PATH}as
+          STRIP=${TOOLCHAIN_PATH}strip
+          NM=${TOOLCHAIN_PATH}nm
 
-            if [ -z "${alt_libc}" ]; then
-                alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
-                          awk '{n = split($0,a,"/"); \
-                                split(a[n-1],b,"-"); \
-                                print $0 " " b[2]}' | \
-                          sort -g -k 2 | \
-                          awk '{ print $1 }' | tail -1`
-            fi
+          if [ -z "${alt_libc}" ]; then
+            alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
+              awk '{n = split($0,a,"/"); \
+                split(a[n-1],b,"-"); \
+                print $0 " " b[2]}' | \
+                sort -g -k 2 | \
+                awk '{ print $1 }' | tail -1`
+          fi
 
-            add_cflags "--sysroot=${alt_libc}"
-            add_ldflags "--sysroot=${alt_libc}"
+          add_cflags "--sysroot=${alt_libc}"
+          add_ldflags "--sysroot=${alt_libc}"
 
-            # linker flag that routes around a CPU bug in some
-            # Cortex-A8 implementations (NDK Dev Guide)
-            add_ldflags "-Wl,--fix-cortex-a8"
+          # linker flag that routes around a CPU bug in some
+          # Cortex-A8 implementations (NDK Dev Guide)
+          add_ldflags "-Wl,--fix-cortex-a8"
 
-            enable_feature pic
-            soft_enable realtime_only
-            if [ ${tgt_isa} = "armv7" ]; then
-                soft_enable runtime_cpu_detect
-            fi
-            if enabled runtime_cpu_detect; then
-                add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
-            fi
+          enable_feature pic
+          soft_enable realtime_only
+          if [ ${tgt_isa} = "armv7" ]; then
+            soft_enable runtime_cpu_detect
+          fi
+          if enabled runtime_cpu_detect; then
+            add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
+          fi
           ;;
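# How the alt_libc fallback above picks its sysroot: the find|awk|sort
# pipeline keeps the arch-arm directory with the highest android-<API> level.
# A standalone sketch of the same selection, using hypothetical NDK paths:
printf '%s\n' \
  '/ndk/platforms/android-9/arch-arm' \
  '/ndk/platforms/android-21/arch-arm' \
  '/ndk/platforms/android-14/arch-arm' |
  awk '{n = split($0,a,"/"); split(a[n-1],b,"-"); print $0 " " b[2]}' |
  sort -g -k 2 | awk '{ print $1 }' | tail -1
# prints /ndk/platforms/android-21/arch-arm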
 
         darwin*)
+          XCRUN_FIND="xcrun --sdk iphoneos --find"
+          CXX="$(${XCRUN_FIND} clang++)"
+          CC="$(${XCRUN_FIND} clang)"
+          AR="$(${XCRUN_FIND} ar)"
+          AS="$(${XCRUN_FIND} as)"
+          STRIP="$(${XCRUN_FIND} strip)"
+          NM="$(${XCRUN_FIND} nm)"
+          RANLIB="$(${XCRUN_FIND} ranlib)"
+          AS_SFX=.s
 
-            XCRUN_FIND="xcrun --sdk iphoneos -find"
-            CXX="$(${XCRUN_FIND} clang++)"
-            CC="$(${XCRUN_FIND} clang)"
-            AR="$(${XCRUN_FIND} ar)"
+          # Special handling of ld for armv6 because libclang_rt.ios.a does
+          # not contain armv6 support in Apple's clang package:
+          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
+          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
+          # renders support for armv6 unnecessary because the 3GS and up
+          # support neon.
+          if [ "${tgt_isa}" = "armv6" ]; then
             LD="$(${XCRUN_FIND} ld)"
-            AS="$(${XCRUN_FIND} as)"
-            STRIP="$(${XCRUN_FIND} strip)"
-            NM="$(${XCRUN_FIND} nm)"
-            RANLIB="$(${XCRUN_FIND} ranlib)"
-            AS_SFX=.s
+          else
+            LD="${CXX:-$(${XCRUN_FIND} ld)}"
+          fi
 
-            # ASFLAGS is written here instead of using check_add_asflags
-            # because we need to overwrite all of ASFLAGS and purge the
-            # options that were put in above
-            ASFLAGS="-arch ${tgt_isa} -g"
+          # ASFLAGS is written here instead of using check_add_asflags
+          # because we need to overwrite all of ASFLAGS and purge the
+          # options that were put in above
+          ASFLAGS="-arch ${tgt_isa} -g"
 
-            alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
-            add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
-            add_ldflags -arch ${tgt_isa} -ios_version_min 7.0
+          add_cflags -arch ${tgt_isa}
+          add_ldflags -arch ${tgt_isa}
 
-            for d in lib usr/lib usr/lib/system; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-            done
+          alt_libc="$(show_darwin_sdk_path iphoneos)"
+          if [ -d "${alt_libc}" ]; then
+            add_cflags -isysroot ${alt_libc}
+          fi
 
-            asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
-         ;;
+          if [ "${LD}" = "${CXX}" ]; then
+            add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+          else
+            add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+          fi
+
+          for d in lib usr/lib usr/lib/system; do
+            try_dir="${alt_libc}/${d}"
+            [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
+          done
+
+          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+          ;;
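# The two version-min spellings above express the same constraint; which one
# applies depends on whether clang++ or Apple's ld drives the link
# (illustrative command shapes, not taken from the build):
#   clang++ -arch armv7 -miphoneos-version-min=${IOS_VERSION_MIN} ...
#   ld -arch armv6 -ios_version_min ${IOS_VERSION_MIN} ...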
 
         linux*)
-            enable_feature linux
-            if enabled rvct; then
-                # Check if we have CodeSourcery GCC in PATH. Needed for
-                # libraries
-                hash arm-none-linux-gnueabi-gcc 2>&- || \
-                  die "Couldn't find CodeSourcery GCC from PATH"
+          enable_feature linux
+          if enabled rvct; then
+            # Check if we have CodeSourcery GCC in PATH. Needed for
+            # libraries
+            hash arm-none-linux-gnueabi-gcc 2>&- || \
+              die "Couldn't find CodeSourcery GCC from PATH"
 
-                # Use armcc as a linker to enable translation of
-                # some gcc specific options such as -lm and -lpthread.
-                LD="armcc --translate_gcc"
+            # Use armcc as a linker to enable translation of
+            # some gcc specific options such as -lm and -lpthread.
+            LD="armcc --translate_gcc"
 
-                # create configuration file (uses path to CodeSourcery GCC)
-                armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
+            # create configuration file (uses path to CodeSourcery GCC)
+            armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
 
-                add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
-                add_asflags --no_hide_all --apcs=/interwork
-                add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
-                enabled pic && add_cflags --apcs=/fpic
-                enabled pic && add_asflags --apcs=/fpic
-                enabled shared && add_cflags --shared
-            fi
-        ;;
-
-        esac
-    ;;
+            add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+            add_asflags --no_hide_all --apcs=/interwork
+            add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+            enabled pic && add_cflags --apcs=/fpic
+            enabled pic && add_asflags --apcs=/fpic
+            enabled shared && add_cflags --shared
+          fi
+          ;;
+      esac
+      ;;
     mips*)
-        link_with_cc=gcc
-        setup_gnu_toolchain
-        tune_cflags="-mtune="
-        if enabled dspr2; then
-            check_add_cflags -mips32r2 -mdspr2
-            disable_feature fast_unaligned
+      link_with_cc=gcc
+      setup_gnu_toolchain
+      tune_cflags="-mtune="
+      if enabled dspr2; then
+        check_add_cflags -mips32r2 -mdspr2
+      fi
+
+      if enabled runtime_cpu_detect; then
+        disable_feature runtime_cpu_detect
+      fi
+
+      if [ -n "${tune_cpu}" ]; then
+        case ${tune_cpu} in
+          p5600)
+            check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+            check_add_cflags -msched-weight -mhard-float -mfp64
+            check_add_asflags -mips32r5 -mhard-float -mfp64
+            check_add_ldflags -mfp64
+            ;;
+          i6400)
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+            check_add_cflags -mload-store-pairs -mhard-float -mfp64
+            check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+            check_add_ldflags -mips64r6 -mabi=64 -mfp64
+            ;;
+        esac
+
+        if enabled msa; then
+          add_cflags -mmsa
+          add_asflags -mmsa
+          add_ldflags -mmsa
         fi
-        check_add_cflags -march=${tgt_isa}
-        check_add_asflags -march=${tgt_isa}
-        check_add_asflags -KPIC
-    ;;
-    ppc*)
-        enable_feature ppc
-        bits=${tgt_isa##ppc}
-        link_with_cc=gcc
-        setup_gnu_toolchain
-        add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
-        soft_enable altivec
-        enabled altivec && add_cflags -maltivec
+      fi
 
-        case "$tgt_os" in
-        linux*)
-            add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
-        ;;
-        darwin*)
-            darwin_arch="-arch ppc"
-            enabled ppc64 && darwin_arch="${darwin_arch}64"
-            add_cflags  ${darwin_arch} -m${bits} -fasm-blocks
-            add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
-            add_ldflags ${darwin_arch} -m${bits}
-            enabled altivec && add_cflags -faltivec
-        ;;
-        esac
-    ;;
+      check_add_cflags -march=${tgt_isa}
+      check_add_asflags -march=${tgt_isa}
+      check_add_asflags -KPIC
+      ;;
     x86*)
-        case  ${tgt_os} in
-            win*)
-                enabled gcc && add_cflags -fno-common
-                ;;
-            solaris*)
-                CC=${CC:-${CROSS}gcc}
-                CXX=${CXX:-${CROSS}g++}
-                LD=${LD:-${CROSS}gcc}
-                CROSS=${CROSS:-g}
-                ;;
-            os2)
-                AS=${AS:-nasm}
-                ;;
-        esac
+      case  ${tgt_os} in
+        win*)
+          enabled gcc && add_cflags -fno-common
+          ;;
+        solaris*)
+          CC=${CC:-${CROSS}gcc}
+          CXX=${CXX:-${CROSS}g++}
+          LD=${LD:-${CROSS}gcc}
+          CROSS=${CROSS:-g}
+          ;;
+        os2)
+          AS=${AS:-nasm}
+          ;;
+      esac
 
-        AS="${alt_as:-${AS:-auto}}"
-        case  ${tgt_cc} in
-            icc*)
-                CC=${CC:-icc}
-                LD=${LD:-icc}
-                setup_gnu_toolchain
-                add_cflags -use-msasm  # remove -use-msasm too?
-                # add -no-intel-extensions to suppress warning #10237
-                # refer to http://software.intel.com/en-us/forums/topic/280199
-                add_ldflags -i-static -no-intel-extensions
-                enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
-                enabled x86_64 && AR=xiar
-                case ${tune_cpu} in
-                    atom*)
-                        tune_cflags="-x"
-                        tune_cpu="SSE3_ATOM"
-                    ;;
-                    *)
-                        tune_cflags="-march="
-                    ;;
-                esac
-            ;;
-            gcc*)
-                link_with_cc=gcc
-                tune_cflags="-march="
-                setup_gnu_toolchain
-                #for 32 bit x86 builds, -O3 did not turn on this flag
-                enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
-            ;;
-            vs*)
-                # When building with Microsoft Visual Studio the assembler is
-                # invoked directly. Checking at configure time is unnecessary.
-                # Skip the check by setting AS arbitrarily
-                AS=msvs
-                msvs_arch_dir=x86-msvs
-                vc_version=${tgt_cc##vs}
-                case $vc_version in
-                    7|8|9|10)
-                         echo "${tgt_cc} does not support avx/avx2, disabling....."
-                         RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
-                         soft_disable avx
-                         soft_disable avx2
-                    ;;
-                esac
-            ;;
-        esac
+      AS="${alt_as:-${AS:-auto}}"
+      case  ${tgt_cc} in
+        icc*)
+          CC=${CC:-icc}
+          LD=${LD:-icc}
+          setup_gnu_toolchain
+          add_cflags -use-msasm  # remove -use-msasm too?
+          # add -no-intel-extensions to suppress warning #10237
+          # refer to http://software.intel.com/en-us/forums/topic/280199
+          add_ldflags -i-static -no-intel-extensions
+          enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
+          enabled x86_64 && AR=xiar
+          case ${tune_cpu} in
+            atom*)
+              tune_cflags="-x"
+              tune_cpu="SSE3_ATOM"
+              ;;
+            *)
+              tune_cflags="-march="
+              ;;
+          esac
+          ;;
+        gcc*)
+          link_with_cc=gcc
+          tune_cflags="-march="
+          setup_gnu_toolchain
+          #for 32 bit x86 builds, -O3 did not turn on this flag
+          enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
+          ;;
+        vs*)
+          # When building with Microsoft Visual Studio the assembler is
+          # invoked directly. Checking at configure time is unnecessary.
+          # Skip the check by setting AS arbitrarily
+          AS=msvs
+          msvs_arch_dir=x86-msvs
+          vc_version=${tgt_cc##vs}
+          case $vc_version in
+            7|8|9|10)
+              echo "${tgt_cc} does not support avx/avx2, disabling....."
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
+              soft_disable avx
+              soft_disable avx2
+              ;;
+          esac
+          ;;
+      esac
 
-        bits=32
-        enabled x86_64 && bits=64
-        check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
+      bits=32
+      enabled x86_64 && bits=64
+      check_cpp <<EOF && bits=x32
+#if !defined(__ILP32__) || !defined(__x86_64__)
 #error "not x32"
 #endif
 EOF
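# A standalone approximation of the x32 probe above (assumes a gcc-compatible
# ${CC}); the x32 ABI defines both __ILP32__ and __x86_64__, so only then does
# the preprocessor run succeed:
printf '#if !defined(__ILP32__) || !defined(__x86_64__)\n#error "not x32"\n#endif\n' |
  ${CC:-cc} -E -x c - >/dev/null 2>&1 && echo "x32 ABI"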
-        case ${tgt_cc} in
-            gcc*)
-                add_cflags -m${bits}
-                add_ldflags -m${bits}
-            ;;
-        esac
+      case ${tgt_cc} in
+        gcc*)
+          add_cflags -m${bits}
+          add_ldflags -m${bits}
+          ;;
+      esac
 
-        soft_enable runtime_cpu_detect
-        # We can't use 'check_cflags' until the compiler is configured and CC is
-        # populated.
-        check_gcc_machine_option mmx
-        check_gcc_machine_option sse
-        check_gcc_machine_option sse2
-        check_gcc_machine_option sse3
-        check_gcc_machine_option ssse3
-        check_gcc_machine_option sse4 sse4_1
-        check_gcc_machine_option avx
-        check_gcc_machine_option avx2
+      soft_enable runtime_cpu_detect
+      # We can't use 'check_cflags' until the compiler is configured and CC is
+      # populated.
+      check_gcc_machine_option mmx
+      check_gcc_machine_option sse
+      check_gcc_machine_option sse2
+      check_gcc_machine_option sse3
+      check_gcc_machine_option ssse3
+      check_gcc_machine_option sse4 sse4_1
+      check_gcc_machine_option avx
+      check_gcc_machine_option avx2
 
-        case "${AS}" in
-            auto|"")
-                which nasm >/dev/null 2>&1 && AS=nasm
-                which yasm >/dev/null 2>&1 && AS=yasm
-                [ "${AS}" = auto ] || [ -z "${AS}" ] \
-                    && die "Neither yasm nor nasm have been found"
-            ;;
-        esac
-        log_echo "  using $AS"
-        [ "${AS##*/}" = nasm ] && add_asflags -Ox
-        AS_SFX=.asm
-        case  ${tgt_os} in
-            win32)
-                add_asflags -f win32
-                enabled debug && add_asflags -g cv8
-                EXE_SFX=.exe
-            ;;
-            win64)
-                add_asflags -f x64
-                enabled debug && add_asflags -g cv8
-                EXE_SFX=.exe
-            ;;
-            linux*|solaris*|android*)
-                add_asflags -f elf${bits}
-                enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
-                enabled debug && [ "${AS}" = nasm ] && add_asflags -g
-                [ "${AS##*/}" = nasm ] && check_asm_align
-            ;;
-            darwin*)
-                add_asflags -f macho${bits}
-                enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
-                add_cflags  ${darwin_arch}
-                add_ldflags ${darwin_arch}
-                # -mdynamic-no-pic is still a bit of voodoo -- it was required at
-                # one time, but does not seem to be now, and it breaks some of the
-                # code that still relies on inline assembly.
-                # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
-                enabled icc && ! enabled pic && add_cflags -fno-pic
-            ;;
-            iphonesimulator)
-                add_asflags -f macho${bits}
-                enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
-                add_cflags  ${sim_arch}
-                add_ldflags ${sim_arch}
-           ;;
-            os2)
-                add_asflags -f aout
-                enabled debug && add_asflags -g
-                EXE_SFX=.exe
-            ;;
-            *) log "Warning: Unknown os $tgt_os while setting up $AS flags"
-            ;;
-        esac
-    ;;
-    universal*|*-gcc|generic-gnu)
-        link_with_cc=gcc
-        enable_feature gcc
-    setup_gnu_toolchain
-    ;;
-    esac
+      case "${AS}" in
+        auto|"")
+          which nasm >/dev/null 2>&1 && AS=nasm
+          which yasm >/dev/null 2>&1 && AS=yasm
+          if [ "${AS}" = nasm ] ; then
+            # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
+            # this check if they start shipping a compatible version.
+            apple=`nasm -v | grep "Apple"`
+            [ -n "${apple}" ] \
+              && echo "Unsupported version of nasm: ${apple}" \
+              && AS=""
+          fi
+          [ "${AS}" = auto ] || [ -z "${AS}" ] \
+            && die "Neither yasm nor nasm have been found"
+          ;;
+      esac
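# What the nasm check above screens out (version text is approximate): the
# nasm bundled with older Xcode releases reports itself along the lines of
#   NASM version 0.98.40 (Apple Computer, Inc. ...)
# Any "Apple" match clears ${AS}, so configure dies here unless yasm was
# found (and yasm, when available, already wins the detection above).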
+      log_echo "  using $AS"
+      [ "${AS##*/}" = nasm ] && add_asflags -Ox
+      AS_SFX=.asm
+      case  ${tgt_os} in
+        win32)
+          add_asflags -f win32
+          enabled debug && add_asflags -g cv8
+          EXE_SFX=.exe
+          ;;
+        win64)
+          add_asflags -f x64
+          enabled debug && add_asflags -g cv8
+          EXE_SFX=.exe
+          ;;
+        linux*|solaris*|android*)
+          add_asflags -f elf${bits}
+          enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
+          enabled debug && [ "${AS}" = nasm ] && add_asflags -g
+          [ "${AS##*/}" = nasm ] && check_asm_align
+          ;;
+        darwin*)
+          add_asflags -f macho${bits}
+          enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
+          add_cflags  ${darwin_arch}
+          add_ldflags ${darwin_arch}
+          # -mdynamic-no-pic is still a bit of voodoo -- it was required at
+          # one time, but does not seem to be now, and it breaks some of the
+          # code that still relies on inline assembly.
+          # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
+          enabled icc && ! enabled pic && add_cflags -fno-pic
+          ;;
+        iphonesimulator)
+          add_asflags -f macho${bits}
+          enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
+          add_cflags  ${sim_arch}
+          add_ldflags ${sim_arch}
+          ;;
+        os2)
+          add_asflags -f aout
+          enabled debug && add_asflags -g
+          EXE_SFX=.exe
+          ;;
+        *)
+          log "Warning: Unknown os $tgt_os while setting up $AS flags"
+          ;;
+      esac
+      ;;
+    *-gcc|generic-gnu)
+      link_with_cc=gcc
+      enable_feature gcc
+      setup_gnu_toolchain
+      ;;
+  esac
 
-    # Try to enable CPU specific tuning
-    if [ -n "${tune_cpu}" ]; then
-        if [ -n "${tune_cflags}" ]; then
-            check_add_cflags ${tune_cflags}${tune_cpu} || \
-                die "Requested CPU '${tune_cpu}' not supported by compiler"
-        fi
+  # Try to enable CPU specific tuning
+  if [ -n "${tune_cpu}" ]; then
+    if [ -n "${tune_cflags}" ]; then
+      check_add_cflags ${tune_cflags}${tune_cpu} || \
+        die "Requested CPU '${tune_cpu}' not supported by compiler"
+    fi
     if [ -n "${tune_asflags}" ]; then
-            check_add_asflags ${tune_asflags}${tune_cpu} || \
-                die "Requested CPU '${tune_cpu}' not supported by assembler"
-        fi
+      check_add_asflags ${tune_asflags}${tune_cpu} || \
+        die "Requested CPU '${tune_cpu}' not supported by assembler"
+    fi
     if [ -z "${tune_cflags}${tune_asflags}" ]; then
-            log_echo "Warning: CPU tuning not supported by this toolchain"
-        fi
+      log_echo "Warning: CPU tuning not supported by this toolchain"
     fi
+  fi
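# Example of the tuning hook above on a GNU toolchain, where tune_cflags is
# "-mtune=": configuring with --cpu=cortex-a8 (value illustrative) ends up
# probing "check_add_cflags -mtune=cortex-a8" and dies if the compiler
# rejects the flag.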
 
-    if enabled debug; then
-        check_add_cflags -g && check_add_ldflags -g
+  if enabled debug; then
+    check_add_cflags -g && check_add_ldflags -g
+  else
+    check_add_cflags -DNDEBUG
+  fi
+
+  enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
+  enabled gcov &&
+    check_add_cflags -fprofile-arcs -ftest-coverage &&
+    check_add_ldflags -fprofile-arcs -ftest-coverage
+
+  if enabled optimizations; then
+    if enabled rvct; then
+      enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
     else
-        check_add_cflags -DNDEBUG
+      enabled small && check_add_cflags -O2 ||  check_add_cflags -O3
     fi
+  fi
 
-    enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
-    enabled gcov &&
-        check_add_cflags -fprofile-arcs -ftest-coverage &&
-        check_add_ldflags -fprofile-arcs -ftest-coverage
+  if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
+    soft_enable use_x86inc
+  fi
 
-    if enabled optimizations; then
-        if enabled rvct; then
-            enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
-        else
-            enabled small && check_add_cflags -O2 ||  check_add_cflags -O3
-        fi
-    fi
+  # Position Independent Code (PIC) support, for building relocatable
+  # shared objects
+  enabled gcc && enabled pic && check_add_cflags -fPIC
 
-    tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
-    # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
-    # non-Darwin target.
-    if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
-            [ "${tgt_os_no_version}" != "darwin" ]; then
-        soft_enable use_x86inc
-    fi
+  # Work around longjmp interception on glibc >= 2.11, to improve binary
+  # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+  enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
 
-    # Position Independent Code (PIC) support, for building relocatable
-    # shared objects
-    enabled gcc && enabled pic && check_add_cflags -fPIC
+  # Check for strip utility variant
+  ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
 
-    # Work around longjmp interception on glibc >= 2.11, to improve binary
-    # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
-    enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
-
-    # Check for strip utility variant
-    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
-
-    # Try to determine target endianness
-    check_cc <<EOF
-    unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
+  # Try to determine target endianness
+  check_cc <<EOF
+unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
 EOF
     [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
         grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
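# A standalone rerun of the endianness probe (assumes a working ${CC}): the
# constant packs 'O','2','B','E' (0x4f 0x32 0x42 0x45) from most- to
# least-significant byte, so the object file contains them in that order only
# on a big-endian target:
cat > "${TMPDIR:-/tmp}/endian_probe.c" <<'PROBE'
unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
PROBE
${CC:-cc} -c "${TMPDIR:-/tmp}/endian_probe.c" -o "${TMPDIR:-/tmp}/endian_probe.o" &&
  od -A n -t x1 "${TMPDIR:-/tmp}/endian_probe.o" | tr -d '\n' |
  grep '4f *32 *42 *45' >/dev/null && echo big-endian || echo little-endian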
 
     # Try to find which inline keywords are supported
     check_cc <<EOF && INLINE="inline"
-    static inline function() {}
-EOF
-    check_cc <<EOF && INLINE="__inline__ __attribute__((always_inline))"
-    static __attribute__((always_inline)) function() {}
+static inline function() {}
 EOF
 
-    # Almost every platform uses pthreads.
-    if enabled multithread; then
-        case ${toolchain} in
-            *-win*-vs*);;
-            *-android-gcc);;
-            *) check_header pthread.h && add_extralibs -lpthread
-        esac
-    fi
-
-    # only for MIPS platforms
+  # Almost every platform uses pthreads.
+  if enabled multithread; then
     case ${toolchain} in
-        mips*)
-            if enabled dspr2; then
-                if enabled big_endian; then
-                    echo "dspr2 optimizations are available only for little endian platforms"
-                    disable_feature dspr2
-                fi
-            fi
+      *-win*-vs*)
+        ;;
+      *-android-gcc)
+        ;;
+      *)
+        check_header pthread.h && add_extralibs -lpthread
         ;;
     esac
+  fi
 
-    # glibc needs these
-    if enabled linux; then
-        add_cflags -D_LARGEFILE_SOURCE
-        add_cflags -D_FILE_OFFSET_BITS=64
-    fi
+  # only for MIPS platforms
+  case ${toolchain} in
+    mips*)
+      if enabled big_endian; then
+        if enabled dspr2; then
+          echo "dspr2 optimizations are available only for little endian platforms"
+          disable_feature dspr2
+        fi
+        if enabled msa; then
+          echo "msa optimizations are available only for little endian platforms"
+          disable_feature msa
+        fi
+      fi
+      ;;
+  esac
 
-    # append any user defined extra cflags
-    if [ -n "${extra_cflags}" ] ; then
-        check_add_cflags ${extra_cflags} || \
-        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
-    fi
+  # glibc needs these
+  if enabled linux; then
+    add_cflags -D_LARGEFILE_SOURCE
+    add_cflags -D_FILE_OFFSET_BITS=64
+  fi
+
+  # append any user defined extra cflags
+  if [ -n "${extra_cflags}" ] ; then
+    check_add_cflags ${extra_cflags} || \
+    die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+  fi
 }
 
 process_toolchain() {
-    process_common_toolchain
+  process_common_toolchain
 }
 
 print_config_mk() {
-    saved_prefix="${prefix}"
-    prefix=$1
-    makefile=$2
-    shift 2
-    for cfg; do
-        if enabled $cfg; then
-            upname="`toupper $cfg`"
-            echo "${prefix}_${upname}=yes" >> $makefile
-        fi
-    done
-    prefix="${saved_prefix}"
+  saved_prefix="${prefix}"
+  prefix=$1
+  makefile=$2
+  shift 2
+  for cfg; do
+    if enabled $cfg; then
+      upname="`toupper $cfg`"
+      echo "${prefix}_${upname}=yes" >> $makefile
+    fi
+  done
+  prefix="${saved_prefix}"
 }
 
 print_config_h() {
-    saved_prefix="${prefix}"
-    prefix=$1
-    header=$2
-    shift 2
-    for cfg; do
-        upname="`toupper $cfg`"
-        if enabled $cfg; then
-            echo "#define ${prefix}_${upname} 1" >> $header
-        else
-            echo "#define ${prefix}_${upname} 0" >> $header
-        fi
-    done
-    prefix="${saved_prefix}"
+  saved_prefix="${prefix}"
+  prefix=$1
+  header=$2
+  shift 2
+  for cfg; do
+    upname="`toupper $cfg`"
+    if enabled $cfg; then
+      echo "#define ${prefix}_${upname} 1" >> $header
+    else
+      echo "#define ${prefix}_${upname} 0" >> $header
+    fi
+  done
+  prefix="${saved_prefix}"
 }
 
 print_config_vars_h() {
-    header=$1
-    shift
-    while [ $# -gt 0 ]; do
-        upname="`toupper $1`"
-        echo "#define ${upname} $2" >> $header
-        shift 2
-    done
+  header=$1
+  shift
+  while [ $# -gt 0 ]; do
+    upname="`toupper $1`"
+    echo "#define ${upname} $2" >> $header
+    shift 2
+  done
 }
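# Output shape of the three helpers above (config names are illustrative):
#   print_config_mk     HAVE config.mk neon        ->  HAVE_NEON=yes   (enabled features only)
#   print_config_h      HAVE config.h  neon media  ->  #define HAVE_NEON 1
#                                                      #define HAVE_MEDIA 0
#   print_config_vars_h config.h INLINE __inline   ->  #define INLINE __inline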
 
 print_webm_license() {
-    saved_prefix="${prefix}"
-    destination=$1
-    prefix="$2"
-    suffix="$3"
-    shift 3
-    cat <<EOF > ${destination}
+  saved_prefix="${prefix}"
+  destination=$1
+  prefix="$2"
+  suffix="$3"
+  shift 3
+  cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
 ${prefix} ${suffix}
 ${prefix} Use of this source code is governed by a BSD-style license${suffix}
@@ -1371,43 +1390,43 @@
 ${prefix} in the file PATENTS.  All contributing project authors may${suffix}
 ${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
 EOF
-    prefix="${saved_prefix}"
+  prefix="${saved_prefix}"
 }
 
 process_targets() {
-    true;
+  true;
 }
 
 process_detect() {
-    true;
+  true;
 }
 
 enable_feature logging
 logfile="config.log"
 self=$0
 process() {
-    cmdline_args="$@"
-    process_cmdline "$@"
-    if enabled child; then
-        echo "# ${self} $@" >> ${logfile}
-    else
-        echo "# ${self} $@" > ${logfile}
-    fi
-    post_process_common_cmdline
-    post_process_cmdline
-    process_toolchain
-    process_detect
-    process_targets
+  cmdline_args="$@"
+  process_cmdline "$@"
+  if enabled child; then
+    echo "# ${self} $@" >> ${logfile}
+  else
+    echo "# ${self} $@" > ${logfile}
+  fi
+  post_process_common_cmdline
+  post_process_cmdline
+  process_toolchain
+  process_detect
+  process_targets
 
-    OOT_INSTALLS="${OOT_INSTALLS}"
-    if enabled source_path_used; then
-    # Prepare the PWD for building.
-    for f in ${OOT_INSTALLS}; do
-            install -D "${source_path}/$f" "$f"
-    done
-    fi
-    cp "${source_path}/build/make/Makefile" .
+  OOT_INSTALLS="${OOT_INSTALLS}"
+  if enabled source_path_used; then
+    # Prepare the PWD for building.
+    for f in ${OOT_INSTALLS}; do
+      install -D "${source_path}/$f" "$f"
+    done
+  fi
+  cp "${source_path}/build/make/Makefile" .
 
-    clean_temp_files
-    true
+  clean_temp_files
+  true
 }
diff --git a/libvpx/build/make/gen_msvs_proj.sh b/libvpx/build/make/gen_msvs_proj.sh
index 3653309..0cf335b 100755
--- a/libvpx/build/make/gen_msvs_proj.sh
+++ b/libvpx/build/make/gen_msvs_proj.sh
@@ -73,6 +73,10 @@
                 open_tag File RelativePath="$f"
 
                 if [ "$pat" == "asm" ] && $asm_use_custom_step; then
+                    # Avoid object file name collisions, i.e. vpx_config.c and
+                    # vpx_config.asm produce the same object file without
+                    # this additional suffix.
+                    objf=${objf%.obj}_asm.obj
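# (Effect of the rename above: ${objf%.obj}_asm.obj strips the ".obj" suffix
# and appends "_asm.obj", e.g. vpx_config.obj -> vpx_config_asm.obj, so the
# two vpx_config sources no longer map to one object file.)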
                     for plat in "${platforms[@]}"; do
                         for cfg in Debug Release; do
                             open_tag FileConfiguration \
@@ -245,13 +249,13 @@
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
     ;;
     *) die "Unsupported target $target!"
     ;;
@@ -295,23 +299,8 @@
         case "$target" in
             x86*)
                 case "$name" in
-                    obj_int_extract)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="0" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
-                            RuntimeLibrary="$debug_runtime" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="1" \
-                            $warn_64bit \
-                    ;;
                     vpx)
                         tag Tool \
-                            Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
-                        tag Tool \
                             Name="VCCLCompilerTool" \
                             Optimization="0" \
                             AdditionalIncludeDirectories="$incs" \
@@ -347,11 +336,6 @@
                 case "$target" in
                     x86*)
                         case "$name" in
-                            obj_int_extract)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    GenerateDebugInformation="true" \
-                            ;;
                             *)
                                 tag Tool \
                                     Name="VCLinkerTool" \
@@ -400,25 +384,8 @@
         case "$target" in
             x86*)
                 case "$name" in
-                    obj_int_extract)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="2" \
-                            FavorSizeorSpeed="1" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
-                            RuntimeLibrary="$release_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="0" \
-                            $warn_64bit \
-                    ;;
                     vpx)
                         tag Tool \
-                            Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
-                        tag Tool \
                             Name="VCCLCompilerTool" \
                             Optimization="2" \
                             FavorSizeorSpeed="1" \
@@ -456,11 +423,6 @@
                 case "$target" in
                     x86*)
                         case "$name" in
-                            obj_int_extract)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    GenerateDebugInformation="true" \
-                            ;;
                             *)
                                 tag Tool \
                                     Name="VCLinkerTool" \
diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh
index ffa3706..664b404 100755
--- a/libvpx/build/make/gen_msvs_sln.sh
+++ b/libvpx/build/make/gen_msvs_sln.sh
@@ -19,13 +19,13 @@
     cat <<EOF
 Usage: ${self_basename} [options] file1 [file2 ...]
 
-This script generates a Visual Studio 2005 solution file from a list of project
+This script generates a Visual Studio solution file from a list of project
 files.
 
 Options:
     --help                      Print this message
     --out=outfile               Redirect output to a file
-    --ver=version               Version (7,8,9,10,11) of visual studio to generate for
+    --ver=version               Version (7,8,9,10,11,12,14) of visual studio to generate for
     --target=isa-os-cc          Target specifier
 EOF
     exit 1
@@ -255,7 +255,7 @@
     ;;
     --ver=*) vs_ver="$optval"
              case $optval in
-             [789]|10|11|12)
+             [789]|10|11|12|14)
              ;;
              *) die Unrecognized Visual Studio Version in $opt
              ;;
@@ -300,12 +300,15 @@
     12) sln_vers="12.00"
        sln_vers_str="Visual Studio 2013"
     ;;
+    14) sln_vers="14.00"
+       sln_vers_str="Visual Studio 2015"
+    ;;
 esac
 case "${vs_ver:-8}" in
     [789])
     sfx=vcproj
     ;;
-    10|11|12)
+    10|11|12|14)
     sfx=vcxproj
     ;;
 esac
diff --git a/libvpx/build/make/gen_msvs_vcxproj.sh b/libvpx/build/make/gen_msvs_vcxproj.sh
index 23ef6a3..182ea28 100755
--- a/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -34,7 +34,7 @@
     --name=project_name         Name of the project (required)
     --proj-guid=GUID            GUID to use for the project
     --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (10,11,12) of visual studio to generate for
+    --ver=version               Version (10,11,12,14) of visual studio to generate for
     --src-path-bare=dir         Path to root of source tree
     -Ipath/to/include           Additional include directories
     -DFLAG[=value]              Preprocessor macros to define
@@ -168,7 +168,7 @@
         --ver=*)
             vs_ver="$optval"
             case "$optval" in
-                10|11|12)
+                10|11|12|14)
                 ;;
                 *) die Unrecognized Visual Studio Version in $opt
                 ;;
@@ -218,7 +218,7 @@
 asm_use_custom_step=false
 uses_asm=${uses_asm:-false}
 case "${vs_ver:-11}" in
-    10|11|12)
+    10|11|12|14)
        asm_use_custom_step=$uses_asm
     ;;
 esac
@@ -253,24 +253,18 @@
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     arm*)
-        asm_Debug_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
-        if [ "$name" = "obj_int_extract" ]; then
-            # We don't want to build this tool for the target architecture,
-            # but for an architecture we can run locally during the build.
-            platforms[0]="Win32"
-        else
-            platforms[0]="ARM"
-        fi
+        platforms[0]="ARM"
+        asm_Debug_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
     ;;
     *) die "Unsupported target $target!"
     ;;
@@ -350,6 +344,9 @@
                 # has to enable AppContainerApplication as well.
                 tag_content PlatformToolset v120
             fi
+            if [ "$vs_ver" = "14" ]; then
+                tag_content PlatformToolset v140
+            fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
                 tag_content WholeProgramOptimization true
@@ -400,23 +397,13 @@
                 if [ "$hostplat" == "ARM" ]; then
                     hostplat=Win32
                 fi
-                open_tag PreBuildEvent
-                tag_content Command "call obj_int_extract.bat &quot;$src_path_bare&quot; $hostplat\\\$(Configuration)"
-                close_tag PreBuildEvent
             fi
             open_tag ClCompile
             if [ "$config" = "Debug" ]; then
                 opt=Disabled
                 runtime=$debug_runtime
                 curlibs=$debug_libs
-                case "$name" in
-                obj_int_extract)
-                    debug=DEBUG
-                    ;;
-                *)
-                    debug=_DEBUG
-                    ;;
-                esac
+                debug=_DEBUG
             else
                 opt=MaxSpeed
                 runtime=$release_runtime
@@ -424,14 +411,7 @@
                 tag_content FavorSizeOrSpeed Speed
                 debug=NDEBUG
             fi
-            case "$name" in
-            obj_int_extract)
-                extradefines=";_CONSOLE"
-                ;;
-            *)
-                extradefines=";$defines"
-                ;;
-            esac
+            extradefines=";$defines"
             tag_content Optimization $opt
             tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)"
             tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)"
@@ -451,10 +431,6 @@
             case "$proj_kind" in
             exe)
                 open_tag Link
-                if [ "$name" != "obj_int_extract" ]; then
-                    tag_content AdditionalDependencies "$curlibs;%(AdditionalDependencies)"
-                    tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)"
-                fi
                 tag_content GenerateDebugInformation true
                 # Console is the default normally, but if
                 # AppContainerApplication is set, we need to override it.
diff --git a/libvpx/build/make/iosbuild.sh b/libvpx/build/make/iosbuild.sh
index fa63a4a..89fa681 100755
--- a/libvpx/build/make/iosbuild.sh
+++ b/libvpx/build/make/iosbuild.sh
@@ -18,15 +18,19 @@
 devnull='> /dev/null 2>&1'
 
 BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+                --disable-examples
+                --disable-libyuv
+                --disable-unit-tests"
 DIST_DIR="_dist"
 FRAMEWORK_DIR="VPX.framework"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
 MAKE_JOBS=1
-LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
 TARGETS="arm64-darwin-gcc
-         armv6-darwin-gcc
          armv7-darwin-gcc
          armv7s-darwin-gcc
          x86-iphonesimulator-gcc
@@ -42,8 +46,8 @@
 
   mkdir "${target}"
   cd "${target}"
-  eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
-      --disable-docs ${devnull}
+  eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+    ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull}
   export DIST_DIR
   eval make -j ${MAKE_JOBS} dist ${devnull}
   cd "${old_pwd}"
@@ -58,11 +62,8 @@
     arm64-*)
       echo "__aarch64__"
       ;;
-    armv6-*)
-      echo "__ARM_ARCH_6__"
-      ;;
     armv7-*)
-      echo "__ARM_ARCH_7__"
+      echo "__ARM_ARCH_7A__"
       ;;
     armv7s-*)
       echo "__ARM_ARCH_7S__"
@@ -176,8 +177,13 @@
 # Trap function. Cleans up the subtree used to build all targets contained in
 # $TARGETS.
 cleanup() {
+  local readonly res=$?
   cd "${ORIG_PWD}"
 
+  if [ $res -ne 0 ]; then
+    elog "build exited with error ($res)"
+  fi
+
   if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
     rm -rf "${BUILD_ROOT}"
   fi
@@ -187,14 +193,21 @@
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
+    --extra-configure-args <args>: Extra args to pass when configuring libvpx.
     --jobs: Number of make jobs.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
+    --targets <targets>: Override default target list. Defaults:
+         ${TARGETS}
     --verbose: Output information about the environment and each stage of the
                build.
 EOF
 }
 
+elog() {
+  echo "${0##*/} failed because: $@" 1>&2
+}
+
 vlog() {
   if [ "${VERBOSE}" = "yes" ]; then
     echo "$@"
@@ -206,6 +219,10 @@
 # Parse the command line.
 while [ -n "$1" ]; do
   case "$1" in
+    --extra-configure-args)
+      EXTRA_CONFIGURE_ARGS="$2"
+      shift
+      ;;
     --help)
       iosbuild_usage
       exit
@@ -220,6 +237,10 @@
     --show-build-output)
       devnull=
       ;;
+    --targets)
+      TARGETS="$2"
+      shift
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -235,6 +256,8 @@
 cat << EOF
   BUILD_ROOT=${BUILD_ROOT}
   DIST_DIR=${DIST_DIR}
+  CONFIGURE_ARGS=${CONFIGURE_ARGS}
+  EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
   FRAMEWORK_DIR=${FRAMEWORK_DIR}
   HEADER_DIR=${HEADER_DIR}
   MAKE_JOBS=${MAKE_JOBS}
@@ -247,3 +270,5 @@
 fi
 
 build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+echo "         ${TARGETS}"
diff --git a/libvpx/build/make/msvs_common.sh b/libvpx/build/make/msvs_common.sh
old mode 100644
new mode 100755
diff --git a/libvpx/build/make/obj_int_extract.c b/libvpx/build/make/obj_int_extract.c
deleted file mode 100644
index 2e50f38..0000000
--- a/libvpx/build/make/obj_int_extract.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-typedef enum {
-  OUTPUT_FMT_PLAIN,
-  OUTPUT_FMT_RVDS,
-  OUTPUT_FMT_GAS,
-  OUTPUT_FMT_C_HEADER,
-} output_fmt_t;
-
-int log_msg(const char *fmt, ...) {
-  int res;
-  va_list ap;
-  va_start(ap, fmt);
-  res = vfprintf(stderr, fmt, ap);
-  va_end(ap);
-  return res;
-}
-
-#if defined(__GNUC__) && __GNUC__
-
-#if defined(FORCE_PARSE_ELF)
-
-#if defined(__MACH__)
-#undef __MACH__
-#endif
-
-#if !defined(__ELF__)
-#define __ELF__
-#endif
-#endif
-
-#if defined(__MACH__)
-
-#include <mach-o/loader.h>
-#include <mach-o/nlist.h>
-
-int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
-  switch (mode) {
-    case OUTPUT_FMT_RVDS:
-      printf("%-40s EQU %5d\n", name, val);
-      return 0;
-    case OUTPUT_FMT_GAS:
-      printf(".set %-40s, %5d\n", name, val);
-      return 0;
-    case OUTPUT_FMT_C_HEADER:
-      printf("#define %-40s %5d\n", name, val);
-      return 0;
-    default:
-      log_msg("Unsupported mode: %d", mode);
-      return 1;
-  }
-}
-
-int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) {
-  int i, j;
-  struct mach_header header;
-  uint8_t *buf = base_buf;
-  int base_data_section = 0;
-  int bits = 0;
-
-  /* We can read in mach_header for 32 and 64 bit architectures
-   * because it's identical to mach_header_64 except for the last
-   * element (uint32_t reserved), which we don't use. Then, when
-   * we know which architecture we're looking at, increment buf
-   * appropriately.
-   */
-  memcpy(&header, buf, sizeof(struct mach_header));
-
-  if (header.magic == MH_MAGIC) {
-    if (header.cputype == CPU_TYPE_ARM
-        || header.cputype == CPU_TYPE_X86) {
-      bits = 32;
-      buf += sizeof(struct mach_header);
-    } else {
-      log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
-      goto bail;
-    }
-  } else if (header.magic == MH_MAGIC_64) {
-    if (header.cputype == CPU_TYPE_X86_64) {
-      bits = 64;
-      buf += sizeof(struct mach_header_64);
-    } else {
-      log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
-      goto bail;
-    }
-  } else {
-    log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
-            MH_MAGIC, MH_MAGIC_64, header.magic);
-    goto bail;
-  }
-
-  if (header.filetype != MH_OBJECT) {
-    log_msg("Bad filetype for object file. Currently only tested for MH_OBJECT.\n");
-    goto bail;
-  }
-
-  for (i = 0; i < header.ncmds; i++) {
-    struct load_command lc;
-
-    memcpy(&lc, buf, sizeof(struct load_command));
-
-    if (lc.cmd == LC_SEGMENT) {
-      uint8_t *seg_buf = buf;
-      struct section s;
-      struct segment_command seg_c;
-
-      memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
-      seg_buf += sizeof(struct segment_command);
-
-      /* Although each section is given it's own offset, nlist.n_value
-       * references the offset of the first section. This isn't
-       * apparent without debug information because the offset of the
-       * data section is the same as the first section. However, with
-       * debug sections mixed in, the offset of the debug section
-       * increases but n_value still references the first section.
-       */
-      if (seg_c.nsects < 1) {
-        log_msg("Not enough sections\n");
-        goto bail;
-      }
-
-      memcpy(&s, seg_buf, sizeof(struct section));
-      base_data_section = s.offset;
-    } else if (lc.cmd == LC_SEGMENT_64) {
-      uint8_t *seg_buf = buf;
-      struct section_64 s;
-      struct segment_command_64 seg_c;
-
-      memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
-      seg_buf += sizeof(struct segment_command_64);
-
-      /* Explanation in LG_SEGMENT */
-      if (seg_c.nsects < 1) {
-        log_msg("Not enough sections\n");
-        goto bail;
-      }
-
-      memcpy(&s, seg_buf, sizeof(struct section_64));
-      base_data_section = s.offset;
-    } else if (lc.cmd == LC_SYMTAB) {
-      if (base_data_section != 0) {
-        struct symtab_command sc;
-        uint8_t *sym_buf = base_buf;
-        uint8_t *str_buf = base_buf;
-
-        memcpy(&sc, buf, sizeof(struct symtab_command));
-
-        if (sc.cmdsize != sizeof(struct symtab_command)) {
-          log_msg("Can't find symbol table!\n");
-          goto bail;
-        }
-
-        sym_buf += sc.symoff;
-        str_buf += sc.stroff;
-
-        for (j = 0; j < sc.nsyms; j++) {
-          /* Location of string is cacluated each time from the
-           * start of the string buffer.  On darwin the symbols
-           * are prefixed by "_", so we bump the pointer by 1.
-           * The target value is defined as an int in *_asm_*_offsets.c,
-           * which is 4 bytes on all targets we currently use.
-           */
-          if (bits == 32) {
-            struct nlist nl;
-            int val;
-
-            memcpy(&nl, sym_buf, sizeof(struct nlist));
-            sym_buf += sizeof(struct nlist);
-
-            memcpy(&val, base_buf + base_data_section + nl.n_value,
-                   sizeof(val));
-            print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
-          } else { /* if (bits == 64) */
-            struct nlist_64 nl;
-            int val;
-
-            memcpy(&nl, sym_buf, sizeof(struct nlist_64));
-            sym_buf += sizeof(struct nlist_64);
-
-            memcpy(&val, base_buf + base_data_section + nl.n_value,
-                   sizeof(val));
-            print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
-          }
-        }
-      }
-    }
-
-    buf += lc.cmdsize;
-  }
-
-  return 0;
-bail:
-  return 1;
-
-}
-
-#elif defined(__ELF__)
-#include "elf.h"
-
-#define COPY_STRUCT(dst, buf, ofst, sz) do {\
-    if(ofst + sizeof((*(dst))) > sz) goto bail;\
-    memcpy(dst, buf+ofst, sizeof((*(dst))));\
-  } while(0)
-
-#define ENDIAN_ASSIGN(val, memb) do {\
-    if(!elf->le_data) {log_msg("Big Endian data not supported yet!\n");goto bail;}\
-    (val) = (memb);\
-  } while(0)
-
-#define ENDIAN_ASSIGN_IN_PLACE(memb) do {\
-    ENDIAN_ASSIGN(memb, memb);\
-  } while(0)
-
-typedef struct {
-  uint8_t      *buf; /* Buffer containing ELF data */
-  size_t        sz;  /* Buffer size */
-  int           le_data; /* Data is little-endian */
-  unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
-  int           bits; /* 32 or 64 */
-  Elf32_Ehdr    hdr32;
-  Elf64_Ehdr    hdr64;
-} elf_obj_t;
-
-int parse_elf_header(elf_obj_t *elf) {
-  int res;
-  /* Verify ELF Magic numbers */
-  COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
-  res = elf->e_ident[EI_MAG0] == ELFMAG0;
-  res &= elf->e_ident[EI_MAG1] == ELFMAG1;
-  res &= elf->e_ident[EI_MAG2] == ELFMAG2;
-  res &= elf->e_ident[EI_MAG3] == ELFMAG3;
-  res &= elf->e_ident[EI_CLASS] == ELFCLASS32
-         || elf->e_ident[EI_CLASS] == ELFCLASS64;
-  res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
-  if (!res) goto bail;
-
-  elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
-  /* Read in relevant values */
-  if (elf->e_ident[EI_CLASS] == ELFCLASS32) {
-    elf->bits = 32;
-    COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
-
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
-  } else { /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
-    elf->bits = 64;
-    COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
-
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
-  }
-
-  return 0;
-bail:
-  log_msg("Failed to parse ELF file header");
-  return 1;
-}
-
-int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64) {
-  if (hdr32) {
-    if (idx >= elf->hdr32.e_shnum)
-      goto bail;
-
-    COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
-                elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
-    ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
-  } else { /* if (hdr64) */
-    if (idx >= elf->hdr64.e_shnum)
-      goto bail;
-
-    COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
-                elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
-    ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
-  }
-
-  return 0;
-bail:
-  return 1;
-}
-
-const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
-  if (elf->bits == 32) {
-    Elf32_Shdr shdr;
-
-    if (parse_elf_section(elf, s_idx, &shdr, NULL)) {
-      log_msg("Failed to parse ELF string table: section %d, index %d\n",
-              s_idx, idx);
-      return "";
-    }
-
-    return (char *)(elf->buf + shdr.sh_offset + idx);
-  } else { /* if (elf->bits == 64) */
-    Elf64_Shdr shdr;
-
-    if (parse_elf_section(elf, s_idx, NULL, &shdr)) {
-      log_msg("Failed to parse ELF string table: section %d, index %d\n",
-              s_idx, idx);
-      return "";
-    }
-
-    return (char *)(elf->buf + shdr.sh_offset + idx);
-  }
-}
-
-int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64) {
-  if (sym32) {
-    COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
-    ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
-  } else { /* if (sym64) */
-    COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
-    ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
-  }
-  return 0;
-bail:
-  return 1;
-}
-
-int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
-  elf_obj_t    elf;
-  unsigned int ofst;
-  int          i;
-  Elf32_Off    strtab_off32;
-  Elf64_Off    strtab_off64; /* save String Table offset for later use */
-
-  memset(&elf, 0, sizeof(elf));
-  elf.buf = buf;
-  elf.sz = sz;
-
-  /* Parse Header */
-  if (parse_elf_header(&elf))
-    goto bail;
-
-  if (elf.bits == 32) {
-    Elf32_Shdr shdr;
-    for (i = 0; i < elf.hdr32.e_shnum; i++) {
-      parse_elf_section(&elf, i, &shdr, NULL);
-
-      if (shdr.sh_type == SHT_STRTAB) {
-        char strtsb_name[128];
-
-        strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
-        if (!(strcmp(strtsb_name, ".shstrtab"))) {
-          /* log_msg("found section: %s\n", strtsb_name); */
-          strtab_off32 = shdr.sh_offset;
-          break;
-        }
-      }
-    }
-  } else { /* if (elf.bits == 64) */
-    Elf64_Shdr shdr;
-    for (i = 0; i < elf.hdr64.e_shnum; i++) {
-      parse_elf_section(&elf, i, NULL, &shdr);
-
-      if (shdr.sh_type == SHT_STRTAB) {
-        char strtsb_name[128];
-
-        strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
-        if (!(strcmp(strtsb_name, ".shstrtab"))) {
-          /* log_msg("found section: %s\n", strtsb_name); */
-          strtab_off64 = shdr.sh_offset;
-          break;
-        }
-      }
-    }
-  }
-
-  /* Parse all Symbol Tables */
-  if (elf.bits == 32) {
-    Elf32_Shdr shdr;
-    for (i = 0; i < elf.hdr32.e_shnum; i++) {
-      parse_elf_section(&elf, i, &shdr, NULL);
-
-      if (shdr.sh_type == SHT_SYMTAB) {
-        for (ofst = shdr.sh_offset;
-             ofst < shdr.sh_offset + shdr.sh_size;
-             ofst += shdr.sh_entsize) {
-          Elf32_Sym sym;
-
-          parse_elf_symbol(&elf, ofst, &sym, NULL);
-
-          /* For all OBJECTS (data objects), extract the value from the
-           * proper data segment.
-           */
-          /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
-              log_msg("found data object %s\n",
-                      parse_elf_string_table(&elf,
-                                             shdr.sh_link,
-                                             sym.st_name));
-           */
-
-          if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
-              && sym.st_size == 4) {
-            Elf32_Shdr dhdr;
-            int val = 0;
-            char section_name[128];
-
-            parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
-
-            /* For explanation, refer to the _MSC_VER version of the code. */
-            strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
-            /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
-            if (strcmp(section_name, ".bss")) {
-              if (sizeof(val) != sym.st_size) {
-                /* The target value is declared as an int in
-                 * *_asm_*_offsets.c, which is 4 bytes on all
-                 * targets we currently use. Complain loudly if
-                 * this is not true.
-                 */
-                log_msg("Symbol size is wrong\n");
-                goto bail;
-              }
-
-              memcpy(&val,
-                     elf.buf + dhdr.sh_offset + sym.st_value,
-                     sym.st_size);
-            }
-
-            if (!elf.le_data) {
-              log_msg("Big Endian data not supported yet!\n");
-              goto bail;
-            }
-
-            switch (mode) {
-              case OUTPUT_FMT_RVDS:
-                printf("%-40s EQU %5d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-                break;
-              case OUTPUT_FMT_GAS:
-                printf(".equ %-40s, %5d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-                break;
-              case OUTPUT_FMT_C_HEADER:
-                printf("#define %-40s %5d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-                break;
-              default:
-                printf("%s = %d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-            }
-          }
-        }
-      }
-    }
-  } else { /* if (elf.bits == 64) */
-    Elf64_Shdr shdr;
-    for (i = 0; i < elf.hdr64.e_shnum; i++) {
-      parse_elf_section(&elf, i, NULL, &shdr);
-
-      if (shdr.sh_type == SHT_SYMTAB) {
-        for (ofst = shdr.sh_offset;
-             ofst < shdr.sh_offset + shdr.sh_size;
-             ofst += shdr.sh_entsize) {
-          Elf64_Sym sym;
-
-          parse_elf_symbol(&elf, ofst, NULL, &sym);
-
-          /* For all OBJECTS (data objects), extract the value from the
-           * proper data segment.
-           */
-          /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
-              log_msg("found data object %s\n",
-                      parse_elf_string_table(&elf,
-                                             shdr.sh_link,
-                                             sym.st_name));
-           */
-
-          if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
-              && sym.st_size == 4) {
-            Elf64_Shdr dhdr;
-            int val = 0;
-            char section_name[128];
-
-            parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
-
-            /* For explanation, refer to the _MSC_VER version of the code. */
-            strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
-            /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
-            if ((strcmp(section_name, ".bss"))) {
-              if (sizeof(val) != sym.st_size) {
-                /* The target value is declared as an int in
-                 * *_asm_*_offsets.c, which is 4 bytes on all
-                 * targets we currently use. Complain loudly if
-                 * this is not true.
-                 */
-                log_msg("Symbol size is wrong\n");
-                goto bail;
-              }
-
-              memcpy(&val,
-                     elf.buf + dhdr.sh_offset + sym.st_value,
-                     sym.st_size);
-            }
-
-            if (!elf.le_data) {
-              log_msg("Big Endian data not supported yet!\n");
-              goto bail;
-            }
-
-            switch (mode) {
-              case OUTPUT_FMT_RVDS:
-                printf("%-40s EQU %5d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-                break;
-              case OUTPUT_FMT_GAS:
-                printf(".equ %-40s, %5d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-                break;
-              default:
-                printf("%s = %d\n",
-                       parse_elf_string_table(&elf,
-                                              shdr.sh_link,
-                                              sym.st_name),
-                       val);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  if (mode == OUTPUT_FMT_RVDS)
-    printf("    END\n");
-
-  return 0;
-bail:
-  log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
-  return 1;
-}
-
-#endif
-#endif /* defined(__GNUC__) && __GNUC__ */
-
-
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
-/*  See "Microsoft Portable Executable and Common Object File Format Specification"
-    for reference.
-*/
-#define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
-#define get_le16(x) ((*(x)) | (*(x+1)) << 8)
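The two macros above assemble 16- and 32-bit values byte by byte so the parser reads the same numbers regardless of host byte order; a quick sanity check of the assumed layout (editorial sketch, not part of the removed file):

  #include <assert.h>
  #include <stdint.h>

  /* The bytes 78 56 34 12 (hex, stored in that order) read back as 0x12345678. */
  static void le32_sanity_check(void) {
    const uint8_t bytes[4] = { 0x78, 0x56, 0x34, 0x12 };
    assert(get_le32(bytes) == 0x12345678u);
  }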
-
-int parse_coff(uint8_t *buf, size_t sz) {
-  unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
-  unsigned int sectionrawdata_ptr;
-  unsigned int i;
-  uint8_t *ptr;
-  uint32_t symoffset;
-
-  char **sectionlist;  // this array holds all section names in their correct order.
-  // it is used to check if the symbol is in .bss or .rdata section.
-
-  nsections = get_le16(buf + 2);
-  symtab_ptr = get_le32(buf + 8);
-  symtab_sz = get_le32(buf + 12);
-  strtab_ptr = symtab_ptr + symtab_sz * 18;
-
-  if (nsections > 96) {
-    log_msg("Too many sections\n");
-    return 1;
-  }
-
-  sectionlist = malloc(nsections * sizeof(sectionlist));
-
-  if (sectionlist == NULL) {
-    log_msg("Allocating first level of section list failed\n");
-    return 1;
-  }
-
-  // log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
-
-  /*
-  The size of the optional header is always zero for an obj file, so the section
-  header follows the file header immediately.
-  */
-
-  ptr = buf + 20;     // section header
-
-  for (i = 0; i < nsections; i++) {
-    char sectionname[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
-    strncpy(sectionname, ptr, 8);
-    // log_msg("COFF: Parsing section %s\n",sectionname);
-
-    sectionlist[i] = malloc(strlen(sectionname) + 1);
-
-    if (sectionlist[i] == NULL) {
-      log_msg("Allocating storage for %s failed\n", sectionname);
-      goto bail;
-    }
-    strcpy(sectionlist[i], sectionname);
-
-    // check if it's .rdata and is not a COMDAT section.
-    if (!strcmp(sectionname, ".rdata") &&
-        (get_le32(ptr + 36) & 0x1000) == 0) {
-      sectionrawdata_ptr = get_le32(ptr + 20);
-    }
-
-    ptr += 40;
-  }
-
-  // log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
-  // log_msg("COFF: raw data pointer ofset for section .rdata is %u\n", sectionrawdata_ptr);
-
-  /*  The compiler puts data with a non-zero offset in the .rdata section, but puts data with a
-      zero offset in the .bss section. So, if the data is in the .bss section, set offset=0.
-      Note from Wiki: In an object module compiled from C, the bss section contains
-      the local variables (but not functions) that were declared with the static keyword,
-      except for those with non-zero initial values. (In C, static variables are initialized
-      to zero by default.) It also contains the non-local (both extern and static) variables
-      that are also initialized to zero (either explicitly or by default).
-      */
-  // move to symbol table
-  /* COFF symbol table:
-      offset      field
-      0           Name(*)
-      8           Value
-      12          SectionNumber
-      14          Type
-      16          StorageClass
-      17          NumberOfAuxSymbols
-      */
-  ptr = buf + symtab_ptr;
-
-  for (i = 0; i < symtab_sz; i++) {
-    int16_t section = get_le16(ptr + 12); // section number
-
-    if (section > 0 && ptr[16] == 2) {
-      // if(section > 0 && ptr[16] == 3 && get_le32(ptr+8)) {
-
-      if (get_le32(ptr)) {
-        char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
-        strncpy(name, ptr, 8);
-        // log_msg("COFF: Parsing symbol %s\n",name);
-        /* The 64bit Windows compiler doesn't prefix with an _.
-         * Check what's there, and bump if necessary
-         */
-        if (name[0] == '_')
-          printf("%-40s EQU ", name + 1);
-        else
-          printf("%-40s EQU ", name);
-      } else {
-        // log_msg("COFF: Parsing symbol %s\n",
-        //        buf + strtab_ptr + get_le32(ptr+4));
-        if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
-          printf("%-40s EQU ",
-                 buf + strtab_ptr + get_le32(ptr + 4) + 1);
-        else
-          printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
-      }
-
-      if (!(strcmp(sectionlist[section - 1], ".bss"))) {
-        symoffset = 0;
-      } else {
-        symoffset = get_le32(buf + sectionrawdata_ptr + get_le32(ptr + 8));
-      }
-
-      // log_msg("      Section: %d\n",section);
-      // log_msg("      Class:   %d\n",ptr[16]);
-      // log_msg("      Address: %u\n",get_le32(ptr+8));
-      // log_msg("      Offset: %u\n", symoffset);
-
-      printf("%5d\n", symoffset);
-    }
-
-    ptr += 18;
-  }
-
-  printf("    END\n");
-
-  for (i = 0; i < nsections; i++) {
-    free(sectionlist[i]);
-  }
-
-  free(sectionlist);
-
-  return 0;
-bail:
-
-  for (i = 0; i < nsections; i++) {
-    free(sectionlist[i]);
-  }
-
-  free(sectionlist);
-
-  return 1;
-}
-#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */
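For reference, the 18-byte symbol records that parse_coff() walks correspond to the layout sketched below; the removed code reads each field with get_le16/get_le32 rather than overlaying a struct, so this is an illustration of the on-disk record only:

  #include <stdint.h>

  /* COFF symbol table record, 18 bytes on disk (matches the offsets listed above). */
  struct coff_symbol_record {
    uint8_t  name[8];          /* short name, or zeroes + string-table offset in bytes 4..7 */
    uint32_t value;            /* offset  8 */
    int16_t  section_number;   /* offset 12 */
    uint16_t type;             /* offset 14 */
    uint8_t  storage_class;    /* offset 16 */
    uint8_t  number_of_aux;    /* offset 17 */
  };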
-
-int main(int argc, char **argv) {
-  output_fmt_t mode = OUTPUT_FMT_PLAIN;
-  const char *f;
-  uint8_t *file_buf;
-  int res;
-  FILE *fp;
-  long int file_size;
-
-  if (argc < 2 || argc > 3) {
-    fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-    fprintf(stderr, "  <obj file>\tobject file to parse\n");
-    fprintf(stderr, "Output Formats:\n");
-    fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-    fprintf(stderr, "  rvds - compatible with armasm\n");
-    fprintf(stderr, "  cheader - c/c++ header file\n");
-    goto bail;
-  }
-
-  f = argv[2];
-
-  if (!strcmp(argv[1], "rvds"))
-    mode = OUTPUT_FMT_RVDS;
-  else if (!strcmp(argv[1], "gas"))
-    mode = OUTPUT_FMT_GAS;
-  else if (!strcmp(argv[1], "cheader"))
-    mode = OUTPUT_FMT_C_HEADER;
-  else
-    f = argv[1];
-
-  fp = fopen(f, "rb");
-
-  if (!fp) {
-    perror("Unable to open file");
-    goto bail;
-  }
-
-  if (fseek(fp, 0, SEEK_END)) {
-    perror("stat");
-    goto bail;
-  }
-
-  file_size = ftell(fp);
-  file_buf = malloc(file_size);
-
-  if (!file_buf) {
-    perror("malloc");
-    goto bail;
-  }
-
-  rewind(fp);
-
-  if (fread(file_buf, sizeof(char), file_size, fp) != file_size) {
-    perror("read");
-    goto bail;
-  }
-
-  if (fclose(fp)) {
-    perror("close");
-    goto bail;
-  }
-
-#if defined(__GNUC__) && __GNUC__
-#if defined(__MACH__)
-  res = parse_macho(file_buf, file_size, mode);
-#elif defined(__ELF__)
-  res = parse_elf(file_buf, file_size, mode);
-#endif
-#endif
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
-  res = parse_coff(file_buf, file_size);
-#endif
-
-  free(file_buf);
-
-  if (!res)
-    return EXIT_SUCCESS;
-
-bail:
-  return EXIT_FAILURE;
-}
diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl
index 28ef69c..991b6ab 100755
--- a/libvpx/build/make/rtcd.pl
+++ b/libvpx/build/make/rtcd.pl
@@ -49,7 +49,7 @@
 
 my %config = ();
 while (<CONFIG_FILE>) {
-  next if !/^CONFIG_/;
+  next if !/^(?:CONFIG_|HAVE_)/;
   chomp;
   my @pair = split /=/;
   $config{$pair[0]} = $pair[1];
@@ -209,14 +209,16 @@
 #define RTCD_EXTERN extern
 #endif
 
+EOF
+
+process_forward_decls();
+print <<EOF;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 EOF
-
-process_forward_decls();
-print "\n";
 declare_function_pointers("c", @ALL_ARCHS);
 
 print <<EOF;
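The reordering above changes the shape of the generated *_rtcd.h: the per-architecture prototypes from process_forward_decls() are now emitted before the extern "C" guard instead of inside it, while the function-pointer declarations stay inside. Roughly (vpx_example_fn is a placeholder name, sketch only):

  void vpx_example_fn_c(void);               /* forward declarations precede the guard */
  void vpx_example_fn_neon(void);

  #ifdef __cplusplus
  extern "C" {
  #endif

  RTCD_EXTERN void (*vpx_example_fn)(void);  /* pointer declarations stay inside */

  #ifdef __cplusplus
  }
  #endif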
@@ -317,13 +319,14 @@
 
   print <<EOF;
 #if HAVE_DSPR2
+void vpx_dsputil_static_init();
 #if CONFIG_VP8
 void dsputil_static_init();
-dsputil_static_init();
 #endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
 #endif
 #endif
 }
@@ -365,30 +368,31 @@
   @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
   &require(@REQUIRES);
   x86;
-} elsif ($opts{arch} eq 'mips32') {
-  @ALL_ARCHS = filter(qw/mips32/);
+} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+  @ALL_ARCHS = filter("$opts{arch}");
   open CONFIG_FILE, $opts{config} or
     die "Error opening config file '$opts{config}': $!\n";
   while (<CONFIG_FILE>) {
     if (/HAVE_DSPR2=yes/) {
-      @ALL_ARCHS = filter(qw/mips32 dspr2/);
+      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+      last;
+    }
+    if (/HAVE_MSA=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
       last;
     }
   }
   close CONFIG_FILE;
   mips;
-} elsif ($opts{arch} eq 'armv5te') {
-  @ALL_ARCHS = filter(qw/edsp/);
-  arm;
 } elsif ($opts{arch} eq 'armv6') {
-  @ALL_ARCHS = filter(qw/edsp media/);
+  @ALL_ARCHS = filter(qw/media/);
   arm;
-} elsif ($opts{arch} eq 'armv7') {
-  @ALL_ARCHS = filter(qw/edsp media neon_asm neon/);
+} elsif ($opts{arch} =~ /armv7\w?/) {
+  @ALL_ARCHS = filter(qw/media neon_asm neon/);
   @REQUIRES = filter(keys %required ? keys %required : qw/media/);
   &require(@REQUIRES);
   arm;
-} elsif ($opts{arch} eq 'armv8') {
+} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
   @ALL_ARCHS = filter(qw/neon/);
   arm;
 } else {
diff --git a/libvpx/build/x86-msvs/obj_int_extract.bat b/libvpx/build/x86-msvs/obj_int_extract.bat
deleted file mode 100644
index dfa3b90..0000000
--- a/libvpx/build/x86-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-REM   Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-REM
-REM   Use of this source code is governed by a BSD-style license
-REM   that can be found in the LICENSE file in the root of the source
-REM   tree. An additional intellectual property rights grant can be found
-REM   in the file PATENTS.  All contributing project authors may
-REM   be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM   %1 - Relative path to the directory containing the vp8 source directory.
-REM   %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
diff --git a/libvpx/codereview.settings b/libvpx/codereview.settings
new file mode 100644
index 0000000..d7c8d39
--- /dev/null
+++ b/libvpx/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by gcl to get repository specific information.
+GERRIT_HOST: chromium-review.googlesource.com
+GERRIT_PORT: 29418
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
diff --git a/libvpx/configure b/libvpx/configure
index 92ca061..ac196da 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -26,19 +26,19 @@
   ${toggle_unit_tests}            unit tests
   ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
   ${toggle_encode_perf_tests}     build encoder perf tests with unit tests
+  --cpu=CPU                       tune for the specified CPU (ARM: cortex-a8, X86: sse3)
   --libc=PATH                     path to alternate libc
   --size-limit=WxH                max size to allow in the decoder
   --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
   --sdk-path=PATH                 path to root of sdk (android builds only)
-  ${toggle_fast_unaligned}        don't use unaligned accesses, even when
-                                  supported by hardware [auto]
   ${toggle_codec_srcs}            in/exclude codec library source code
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
+  ${toggle_vp9_highbitdepth}      use VP9 high bit depth (10/12) profiles
   ${toggle_vp8}                   VP8 codec support
   ${toggle_vp9}                   VP9 codec support
+  ${toggle_vp10}                  VP10 codec support
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
-  ${toggle_mem_tracker}           track memory usage
   ${toggle_postproc}              postprocessing
   ${toggle_vp9_postproc}          vp9 specific postprocessing
   ${toggle_multithread}           multithreaded encoding and decoding
@@ -56,6 +56,8 @@
   ${toggle_postproc_visualizer}   macro block / block level visualizers
   ${toggle_multi_res_encoding}    enable multiple-resolution encoding
   ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
+  ${toggle_vp9_temporal_denoising}
+                                  enable vp9 temporal denoising
   ${toggle_webm_io}               enable input from and output to WebM container
   ${toggle_libyuv}                enable libyuv
 
@@ -93,10 +95,6 @@
 
 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv5te-android-gcc"
-all_platforms="${all_platforms} armv5te-linux-rvct"
-all_platforms="${all_platforms} armv5te-linux-gcc"
-all_platforms="${all_platforms} armv5te-none-rvct"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
@@ -109,14 +107,10 @@
 all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-win32-vs11"
 all_platforms="${all_platforms} armv7-win32-vs12"
+all_platforms="${all_platforms} armv7-win32-vs14"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
+all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
 all_platforms="${all_platforms} x86-android-gcc"
 all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -127,6 +121,7 @@
 all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-darwin14-gcc"
 all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
@@ -139,11 +134,14 @@
 all_platforms="${all_platforms} x86-win32-vs10"
 all_platforms="${all_platforms} x86-win32-vs11"
 all_platforms="${all_platforms} x86-win32-vs12"
+all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86_64-android-gcc"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-darwin14-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@@ -154,12 +152,7 @@
 all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
 all_platforms="${all_platforms} x86_64-win64-vs12"
-all_platforms="${all_platforms} universal-darwin8-gcc"
-all_platforms="${all_platforms} universal-darwin9-gcc"
-all_platforms="${all_platforms} universal-darwin10-gcc"
-all_platforms="${all_platforms} universal-darwin11-gcc"
-all_platforms="${all_platforms} universal-darwin12-gcc"
-all_platforms="${all_platforms} universal-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-win64-vs14"
 all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
@@ -196,6 +189,14 @@
     [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
 fi
 
+# disable codecs when their source directory does not exist
+[ -d "${source_path}/vp8" ] || disable_feature vp8
+[ -d "${source_path}/vp9" ] || disable_feature vp9
+[ -d "${source_path}/vp10" ] || disable_feature vp10
+
+# disable vp10 codec by default
+disable_feature vp10
+
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
@@ -205,45 +206,31 @@
 
 enable_feature static
 enable_feature optimizations
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
+enable_feature dependency_tracking
 enable_feature spatial_resampling
 enable_feature multithread
 enable_feature os_support
 enable_feature temporal_denoising
 
-[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout
-for d in vp8 vp9; do
-    [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout;
-done
-
-if ! enabled alt_tree_layout; then
-# development environment
-[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
-[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
-else
-# customer environment
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder"
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder"
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder"
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder"
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder
-
-[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt
-fi
-
-CODECS="$(echo ${CODECS} | tr ' ' '\n')"
-CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)"
+CODECS="
+    vp8_encoder
+    vp8_decoder
+    vp9_encoder
+    vp9_decoder
+    vp10_encoder
+    vp10_decoder
+"
+CODEC_FAMILIES="
+    vp8
+    vp9
+    vp10
+"
 
 ARCH_LIST="
     arm
     mips
     x86
     x86_64
-    ppc32
-    ppc64
 "
 ARCH_EXT_LIST="
     edsp
@@ -253,6 +240,8 @@
 
     mips32
     dspr2
+    msa
+    mips64
 
     mmx
     sse
@@ -262,24 +251,22 @@
     sse4_1
     avx
     avx2
-
-    altivec
 "
 HAVE_LIST="
     ${ARCH_EXT_LIST}
     vpx_ports
     stdint_h
-    alt_tree_layout
     pthread_h
     sys_mman_h
     unistd_h
 "
 EXPERIMENT_LIST="
     spatial_svc
-    vp9_temporal_denoising
     fp_mb_stats
+    emulate_hardware
 "
 CONFIG_LIST="
+    dependency_tracking
     external_build
     install_docs
     install_bins
@@ -297,10 +284,6 @@
 
     codec_srcs
     debug_libs
-    fast_unaligned
-    mem_manager
-    mem_tracker
-    mem_checks
 
     dequant_tokens
     dc_recon
@@ -330,12 +313,15 @@
     encode_perf_tests
     multi_res_encoding
     temporal_denoising
+    vp9_temporal_denoising
     coefficient_range_checking
+    vp9_highbitdepth
     experimental
     size_limit
     ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
+    dependency_tracking
     external_build
     extra_warnings
     werror
@@ -359,7 +345,6 @@
     libc
     as
     size_limit
-    fast_unaligned
     codec_srcs
     debug_libs
 
@@ -372,7 +357,6 @@
     ${CODECS}
     ${CODEC_FAMILIES}
     static_msvcrt
-    mem_tracker
     spatial_resampling
     realtime_only
     onthefly_bitpacking
@@ -388,7 +372,9 @@
     encode_perf_tests
     multi_res_encoding
     temporal_denoising
+    vp9_temporal_denoising
     coefficient_range_checking
+    vp9_highbitdepth
     experimental
 "
 
@@ -443,24 +429,8 @@
 
 process_targets() {
     enabled child || write_common_config_banner
-    enabled universal || write_common_target_config_h  ${BUILD_PFX}vpx_config.h
-
-    # TODO: add host tools target (obj_int_extract, etc)
-
-    # For fat binaries, call configure recursively to configure for each
-    # binary architecture to be included.
-    if enabled universal; then
-        # Call configure (ourselves) for each subarchitecture
-        for arch in $fat_bin_archs; do
-            BUILD_PFX=${arch}/ toolchain=${arch} $self --child $cmdline_args || exit $?
-        done
-    fi
-
-    # The write_common_config (config.mk) logic is deferred until after the
-    # recursive calls to configure complete, because we want our universal
-    # targets to be executed last.
+    write_common_target_config_h ${BUILD_PFX}vpx_config.h
     write_common_config_targets
-    enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
 
     # Calculate the default distribution name, based on the enabled features
     cf=""
@@ -536,11 +506,11 @@
         # Can only build shared libs on a subset of platforms. Doing this check
         # here rather than at option parse time because the target auto-detect
         # magic happens after the command line has been parsed.
-        if ! enabled linux; then
+        if ! enabled linux && ! enabled os2; then
             if enabled gnu; then
                 echo "--enable-shared is only supported on ELF; assuming this is OK"
             else
-                die "--enable-shared only supported on ELF for now"
+                die "--enable-shared only supported on ELF and OS/2 for now"
             fi
         fi
     fi
@@ -605,30 +575,6 @@
 process_toolchain() {
     process_common_toolchain
 
-    # Handle universal binaries for this architecture
-    case $toolchain in
-        universal-darwin*)
-            darwin_ver=${tgt_os##darwin}
-
-            # Snow Leopard (10.6/darwin10) dropped support for PPC
-            # Include PPC support for all prior versions
-            if [ $darwin_ver -lt 10 ]; then
-                fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
-            fi
-
-            # Tiger (10.4/darwin8) brought support for x86
-            if [ $darwin_ver -ge 8 ]; then
-                fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
-            fi
-
-            # Leopard (10.5/darwin9) brought 64 bit support
-            if [ $darwin_ver -ge 9 ]; then
-                fat_bin_archs="$fat_bin_archs x86_64-${tgt_os}-${tgt_cc}"
-            fi
-            ;;
-    esac
-
-
     # Enable some useful compiler flags
     if enabled gcc; then
         enabled werror && check_add_cflags -Werror
@@ -704,7 +650,7 @@
                  VCPROJ_SFX=vcproj
                  gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
                  ;;
-             10|11|12)
+             10|11|12|14)
                  VCPROJ_SFX=vcxproj
                  gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
                  enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
@@ -716,7 +662,7 @@
     esac
 
     # Other toolchain specific defaults
-    case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+    case $toolchain in x86*) soft_enable postproc;; esac
 
     if enabled postproc_visualizer; then
         enabled postproc || die "postproc_visualizer requires postproc to be enabled"
@@ -780,6 +726,7 @@
 process "$@"
 print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */"
 cat <<EOF >> ${BUILD_PFX}vpx_config.c
+#include "vpx/vpx_codec.h"
 static const char* const cfg = "$CONFIGURE_ARGS";
 const char *vpx_codec_build_config(void) {return cfg;}
 EOF
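Including vpx/vpx_codec.h here gives the generated vpx_config.c a prototype for vpx_codec_build_config(), which is declared in that header; the generated file then ends up roughly as below (the argument string is a placeholder, sketch only):

  #include "vpx/vpx_codec.h"  /* declares vpx_codec_build_config() */
  static const char* const cfg = "--target=generic-gnu";  /* placeholder CONFIGURE_ARGS */
  const char *vpx_codec_build_config(void) { return cfg; }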
diff --git a/libvpx/examples.mk b/libvpx/examples.mk
index 91bd45a..dfa5a65 100644
--- a/libvpx/examples.mk
+++ b/libvpx/examples.mk
@@ -22,16 +22,18 @@
                 third_party/libyuv/source/planar_functions.cc \
                 third_party/libyuv/source/row_any.cc \
                 third_party/libyuv/source/row_common.cc \
+                third_party/libyuv/source/row_gcc.cc \
                 third_party/libyuv/source/row_mips.cc \
                 third_party/libyuv/source/row_neon.cc \
                 third_party/libyuv/source/row_neon64.cc \
-                third_party/libyuv/source/row_posix.cc \
                 third_party/libyuv/source/row_win.cc \
                 third_party/libyuv/source/scale.cc \
+                third_party/libyuv/source/scale_any.cc \
                 third_party/libyuv/source/scale_common.cc \
+                third_party/libyuv/source/scale_gcc.cc \
                 third_party/libyuv/source/scale_mips.cc \
                 third_party/libyuv/source/scale_neon.cc \
-                third_party/libyuv/source/scale_posix.cc \
+                third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_win.cc \
 
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
@@ -55,6 +57,7 @@
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/mem_ops.h
 vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxdec.SRCS                 += vpx_ports/msvc.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
@@ -79,6 +82,7 @@
 vpxenc.SRCS                 += warnings.c warnings.h
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS                 += vpx_ports/msvc.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += vpxstats.c vpxstats.h
 ifeq ($(CONFIG_LIBYUV),yes)
@@ -97,6 +101,7 @@
   vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
   vp9_spatial_svc_encoder.SRCS        += video_common.h
   vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
   vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
   vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
   vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
@@ -111,9 +116,10 @@
 vpx_temporal_svc_encoder.SRCS        += tools_common.c tools_common.h
 vpx_temporal_svc_encoder.SRCS        += video_common.h
 vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.SRCS        += vpx_ports/msvc.h
 vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
-EXAMPLES-$(CONFIG_VP8_DECODER)     += simple_decoder.c
+EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
 simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
 simple_decoder.SRCS                += ivfdec.h ivfdec.c
 simple_decoder.SRCS                += tools_common.h tools_common.c
@@ -121,17 +127,19 @@
 simple_decoder.SRCS                += video_reader.h video_reader.c
 simple_decoder.SRCS                += vpx_ports/mem_ops.h
 simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
+simple_decoder.SRCS                += vpx_ports/msvc.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
-EXAMPLES-$(CONFIG_VP8_DECODER)     += postproc.c
+EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
 postproc.SRCS                      += tools_common.h tools_common.c
 postproc.SRCS                      += video_common.h
 postproc.SRCS                      += video_reader.h video_reader.c
 postproc.SRCS                      += vpx_ports/mem_ops.h
 postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
+postproc.SRCS                      += vpx_ports/msvc.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
-EXAMPLES-$(CONFIG_VP8_DECODER)     += decode_to_md5.c
+EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
 decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
 decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
 decode_to_md5.SRCS                 += tools_common.h tools_common.c
@@ -139,31 +147,41 @@
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
 decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
 decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
+decode_to_md5.SRCS                 += vpx_ports/msvc.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += simple_encoder.c
+EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
 simple_encoder.SRCS             += ivfenc.h ivfenc.c
 simple_encoder.SRCS             += tools_common.h tools_common.c
 simple_encoder.SRCS             += video_common.h
 simple_encoder.SRCS             += video_writer.h video_writer.c
+simple_encoder.SRCS             += vpx_ports/msvc.h
 simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
 simple_encoder.DESCRIPTION       = Simplified encoder loop
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += twopass_encoder.c
+EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
+vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
+vp9_lossless_encoder.SRCS       += video_common.h
+vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.SRCS       += vpx_ports/msvc.h
+vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
+vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
+EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
 twopass_encoder.SRCS            += ivfenc.h ivfenc.c
 twopass_encoder.SRCS            += tools_common.h tools_common.c
 twopass_encoder.SRCS            += video_common.h
 twopass_encoder.SRCS            += video_writer.h video_writer.c
+twopass_encoder.SRCS            += vpx_ports/msvc.h
 twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
 twopass_encoder.DESCRIPTION      = Two-pass encoder loop
-ifeq ($(CONFIG_DECODERS),yes)
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += decode_with_drops.c
+EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
 decode_with_drops.SRCS          += ivfdec.h ivfdec.c
 decode_with_drops.SRCS          += tools_common.h tools_common.c
 decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
 decode_with_drops.SRCS          += vpx_ports/mem_ops.h
 decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
-endif
+decode_with_drops.SRCS          += vpx_ports/msvc.h
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
@@ -171,6 +189,7 @@
 set_maps.SRCS                      += tools_common.h tools_common.c
 set_maps.SRCS                      += video_common.h
 set_maps.SRCS                      += video_writer.h video_writer.c
+set_maps.SRCS                      += vpx_ports/msvc.h
 set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 set_maps.DESCRIPTION                = Set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
@@ -178,6 +197,7 @@
 vp8cx_set_ref.SRCS                 += tools_common.h tools_common.c
 vp8cx_set_ref.SRCS                 += video_common.h
 vp8cx_set_ref.SRCS                 += video_writer.h video_writer.c
+vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
@@ -185,7 +205,10 @@
 ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
 ifeq ($(CONFIG_LIBYUV),yes)
 EXAMPLES-$(CONFIG_VP8_ENCODER)          += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
 vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
+vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
+vp8_multi_resolution_encoder.SRCS       += vpx_ports/msvc.h
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
@@ -246,14 +269,6 @@
 $(foreach ex,$(ALL_EXAMPLES),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk))
 
 
-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_OBJS variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_OBJS variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_OBJS,BUILD_OBJS):=yes)
-
-
 # Create build/install dependencies for all examples. The common case
 # is handled here. The MSVS case is handled below.
 NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
@@ -261,24 +276,28 @@
 INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
 DIST-SRCS-yes              += $(ALL_SRCS)
 INSTALL-SRCS-yes           += $(UTIL_SRCS)
-OBJS-$(NOT_MSVS)           += $(if $(BUILD_OBJS),$(call objs,$(ALL_SRCS)))
+OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
 BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))
 
 
 # Instantiate linker template for all examples.
 CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
-SHARED_LIB_SUF=$(if $(filter darwin%,$(TGT_OS)),.dylib,.so)
+ifneq ($(filter darwin%,$(TGT_OS)),)
+SHARED_LIB_SUF=.dylib
+else
+ifneq ($(filter os2%,$(TGT_OS)),)
+SHARED_LIB_SUF=_dll.a
+else
+SHARED_LIB_SUF=.so
+endif
+endif
 CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
 $(foreach bin,$(BINS-yes),\
-    $(if $(BUILD_OBJS),$(eval $(bin):\
-        $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
-    $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
+    $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\
+    $(eval $(call linker_template,$(bin),\
         $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
         -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
-        )))\
-    $(if $(LIPO_OBJS),$(eval $(call lipo_bin_template,$(bin))))\
-    )
-
+        )))
 
 # The following pairs define a mapping of locations in the distribution
 # tree to locations in the source/build trees.
@@ -306,8 +325,8 @@
 # the makefiles). We may want to revisit this.
 define vcproj_template
 $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
-	@echo "    [vcproj] $$@"
-	$$(GEN_VCPROJ)\
+	$(if $(quiet),@echo "    [vcproj] $$@")
+	$(qexec)$$(GEN_VCPROJ)\
             --exe\
             --target=$$(TOOLCHAIN)\
             --name=$$(@:.$(VCPROJ_SFX)=)\
@@ -330,6 +349,7 @@
 #
 %.dox: %.c
 	@echo "    [DOXY] $@"
+	@mkdir -p $(dir $@)
 	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
 	@echo "   \includelineno $(<F)" >> $@
 	@echo "*/" >> $@
diff --git a/libvpx/examples/decode_to_md5.c b/libvpx/examples/decode_to_md5.c
index 1c56303..1ae7a4b 100644
--- a/libvpx/examples/decode_to_md5.c
+++ b/libvpx/examples/decode_to_md5.c
@@ -33,14 +33,12 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
-#include "./md5_utils.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../md5_utils.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
 #include "./vpx_config.h"
 
 static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
@@ -73,7 +71,7 @@
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
   exit(EXIT_FAILURE);
 }
diff --git a/libvpx/examples/decode_with_drops.c b/libvpx/examples/decode_with_drops.c
index a20fdac..2233e47 100644
--- a/libvpx/examples/decode_with_drops.c
+++ b/libvpx/examples/decode_with_drops.c
@@ -56,18 +56,16 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
 #include "./vpx_config.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
   exit(EXIT_FAILURE);
 }
diff --git a/libvpx/examples/postproc.c b/libvpx/examples/postproc.c
index 59c50b1..a8ac208 100644
--- a/libvpx/examples/postproc.c
+++ b/libvpx/examples/postproc.c
@@ -43,18 +43,16 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
 #include "./vpx_config.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
   exit(EXIT_FAILURE);
 }
diff --git a/libvpx/examples/resize_util.c b/libvpx/examples/resize_util.c
index b068f55..e6fdd5b 100644
--- a/libvpx/examples/resize_util.c
+++ b/libvpx/examples/resize_util.c
@@ -15,15 +15,23 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./vp9/encoder/vp9_resize.h"
+#include "../tools_common.h"
+#include "../vp9/encoder/vp9_resize.h"
 
-static void usage(char *progname) {
+static const char *exec_name = NULL;
+
+static void usage() {
   printf("Usage:\n");
   printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
-         progname);
+         exec_name);
   printf("<output_yuv> [<frames>]\n");
 }
 
+void usage_exit(void) {
+  usage();
+  exit(EXIT_FAILURE);
+}
+
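resize_util.c now pulls in ../tools_common.h, whose helpers expect every example that links tools_common.c to supply its own usage_exit(); the wrapper added above satisfies that contract. Sketch of the assumed dependency (paraphrased from tools_common.h, not shown in this patch):

  /* Assumed contract between the examples and tools_common: */
  void usage_exit(void);            /* declared in tools_common.h, defined per example */
  void die(const char *fmt, ...);   /* prints its message, then calls usage_exit()      */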
 static int parse_dim(char *v, int *width, int *height) {
   char *x = strchr(v, 'x');
   if (x == NULL)
@@ -47,9 +55,11 @@
   int f, frames;
   int width, height, target_width, target_height;
 
+  exec_name = argv[0];
+
   if (argc < 5) {
     printf("Incorrect parameters:\n");
-    usage(argv[0]);
+    usage();
     return 1;
   }
 
@@ -57,25 +67,25 @@
   fout = argv[4];
   if (!parse_dim(argv[2], &width, &height)) {
     printf("Incorrect parameters: %s\n", argv[2]);
-    usage(argv[0]);
+    usage();
     return 1;
   }
   if (!parse_dim(argv[3], &target_width, &target_height)) {
     printf("Incorrect parameters: %s\n", argv[3]);
-    usage(argv[0]);
+    usage();
     return 1;
   }
 
   fpin = fopen(fin, "rb");
   if (fpin == NULL) {
     printf("Can't open file %s to read\n", fin);
-    usage(argv[0]);
+    usage();
     return 1;
   }
   fpout = fopen(fout, "wb");
   if (fpout == NULL) {
     printf("Can't open file %s to write\n", fout);
-    usage(argv[0]);
+    usage();
     return 1;
   }
   if (argc >= 6)
diff --git a/libvpx/examples/set_maps.c b/libvpx/examples/set_maps.c
index af8c582..1dc3ac0 100644
--- a/libvpx/examples/set_maps.c
+++ b/libvpx/examples/set_maps.c
@@ -42,20 +42,20 @@
 // Use the `simple_decoder` example to decode this sample, and observe
 // the change in the image at frames 22, 33, and 44.
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
           exec_name);
   exit(EXIT_FAILURE);
@@ -125,10 +125,11 @@
     die_codec(codec, "Failed to set active map");
 }
 
-static void encode_frame(vpx_codec_ctx_t *codec,
-                         vpx_image_t *img,
-                         int frame_index,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -137,6 +138,8 @@
     die_codec(codec, "Failed to encode frame");
 
   while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
       if (!vpx_video_writer_write_frame(writer,
@@ -150,6 +153,8 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
 }
 
 int main(int argc, char **argv) {
@@ -172,9 +177,10 @@
   memset(&info, 0, sizeof(info));
 
   encoder = get_vpx_encoder_by_name(argv[1]);
-  if (!encoder)
+  if (encoder == NULL) {
     die("Unsupported codec.");
-
+  }
+  assert(encoder != NULL);
   info.codec_fourcc = encoder->fourcc;
   info.frame_width = strtol(argv[2], NULL, 0);
   info.frame_height = strtol(argv[3], NULL, 0);
@@ -217,6 +223,7 @@
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
     die_codec(&codec, "Failed to initialize encoder");
 
+  // Encode frames.
   while (vpx_img_read(&raw, infile)) {
     ++frame_count;
 
@@ -230,7 +237,10 @@
 
     encode_frame(&codec, &raw, frame_count, writer);
   }
-  encode_frame(&codec, NULL, -1, writer);
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {}
+
   printf("\n");
   fclose(infile);
   printf("Processed %d frames.\n", frame_count);
diff --git a/libvpx/examples/simple_decoder.c b/libvpx/examples/simple_decoder.c
index 3318758..8ccc810 100644
--- a/libvpx/examples/simple_decoder.c
+++ b/libvpx/examples/simple_decoder.c
@@ -29,30 +29,29 @@
 // -----------------
 // For decoders, you only have to include `vpx_decoder.h` and then any
 // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
 //
 // Initializing The Codec
 // ----------------------
-// The decoder is initialized by the following code. This is an example for
-// the VP8 decoder, but the code is analogous for all algorithms. Replace
-// `vpx_codec_vp8_dx()` with a pointer to the interface exposed by the
-// algorithm you want to use. The `cfg` argument is left as NULL in this
-// example, because we want the algorithm to determine the stream
-// configuration (width/height) and allocate memory automatically. This
-// parameter is generally only used if you need to preallocate memory,
-// particularly in External Memory Allocation mode.
+// The libvpx decoder is initialized by the call to vpx_codec_dec_init().
+// Determining the codec interface to use is handled by VpxVideoReader and the
+// functions prefixed with vpx_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's a VPx file and which VPx
+// codec is contained within the file.
+// Note the NULL pointer passed to vpx_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
 //
 // Decoding A Frame
 // ----------------
 // Once the frame has been read into memory, it is decoded using the
 // `vpx_codec_decode` function. The call takes a pointer to the data
-// (`frame`) and the length of the data (`frame_sz`). No application data
+// (`frame`) and the length of the data (`frame_size`). No application data
 // is associated with the frame in this example, so the `user_priv`
 // parameter is NULL. The `deadline` parameter is left at zero for this
-// example. This parameter is generally only used when doing adaptive
-// postprocessing.
+// example. This parameter is generally only used when doing adaptive post
+// processing.
 //
 // Codecs may produce a variable number of output frames for every call to
 // `vpx_codec_decode`. These frames are retrieved by the
@@ -74,25 +73,22 @@
 // --------------
 // This example does not special case any error return codes. If there was
 // an error, a descriptive message is printed and the program exits. With
-// few exeptions, vpx_codec functions return an enumerated error status,
+// few exceptions, vpx_codec functions return an enumerated error status,
 // with the value `0` indicating success.
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
-#include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
 #include "./vpx_config.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
   exit(EXIT_FAILURE);
 }
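The rewritten comment earlier in this file's diff describes initialization with a NULL configuration and a per-frame decode/drain loop; boiled down, those calls look like the sketch below (decoder, frame, frame_size, and outfile stand in for what VpxVideoReader and the surrounding example provide):

  vpx_codec_ctx_t codec;
  if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
    die_codec(&codec, "Failed to initialize decoder.");

  /* Per input frame: decode, then drain every output image. */
  if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
    die_codec(&codec, "Failed to decode frame.");

  vpx_codec_iter_t iter = NULL;
  vpx_image_t *img;
  while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
    vpx_img_write(img, outfile);  /* write each decoded image out */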
diff --git a/libvpx/examples/simple_encoder.c b/libvpx/examples/simple_encoder.c
index 30bb73a..a307729 100644
--- a/libvpx/examples/simple_encoder.c
+++ b/libvpx/examples/simple_encoder.c
@@ -28,9 +28,7 @@
 // -----------------
 // For encoders, you only have to include `vpx_encoder.h` and then any
 // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
 //
 // Getting The Default Configuration
 // ---------------------------------
@@ -101,15 +99,14 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <codec> <width> <height> <infile> <outfile> "
               "<keyframe-interval> [<error-resilient>]\nSee comments in "
diff --git a/libvpx/examples/twopass_encoder.c b/libvpx/examples/twopass_encoder.c
index 369b1d8..aecc11d 100644
--- a/libvpx/examples/twopass_encoder.c
+++ b/libvpx/examples/twopass_encoder.c
@@ -28,9 +28,8 @@
 // Encoding A Frame
 // ----------------
 // Encoding a frame in two pass mode is identical to the simple encoder
-// example, except the deadline is set to VPX_DL_BEST_QUALITY to get the
-// best quality possible. VPX_DL_GOOD_QUALITY could also be used.
-//
+// example. To increase the quality while sacrificing encoding speed,
+// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY.
 //
 // Processing Statistics Packets
 // -----------------------------
@@ -52,27 +51,27 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
 
-static void get_frame_stats(vpx_codec_ctx_t *ctx,
-                            const vpx_image_t *img,
-                            vpx_codec_pts_t pts,
-                            unsigned int duration,
-                            vpx_enc_frame_flags_t flags,
-                            unsigned int deadline,
-                            vpx_fixed_buf_t *stats) {
+static int get_frame_stats(vpx_codec_ctx_t *ctx,
+                           const vpx_image_t *img,
+                           vpx_codec_pts_t pts,
+                           unsigned int duration,
+                           vpx_enc_frame_flags_t flags,
+                           unsigned int deadline,
+                           vpx_fixed_buf_t *stats) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -81,6 +80,8 @@
     die_codec(ctx, "Failed to get frame stats.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_STATS_PKT) {
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -89,15 +90,18 @@
       stats->sz += pkt_size;
     }
   }
+
+  return got_pkts;
 }
 
-static void encode_frame(vpx_codec_ctx_t *ctx,
-                         const vpx_image_t *img,
-                         vpx_codec_pts_t pts,
-                         unsigned int duration,
-                         vpx_enc_frame_flags_t flags,
-                         unsigned int deadline,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *ctx,
+                        const vpx_image_t *img,
+                        vpx_codec_pts_t pts,
+                        unsigned int duration,
+                        vpx_enc_frame_flags_t flags,
+                        unsigned int deadline,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -106,6 +110,7 @@
     die_codec(ctx, "Failed to encode frame.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
 
@@ -117,19 +122,90 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
+}
+
+static vpx_fixed_buf_t pass0(vpx_image_t *raw,
+                             FILE *infile,
+                             const VpxInterface *encoder,
+                             const vpx_codec_enc_cfg_t *cfg) {
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+  vpx_fixed_buf_t stats = {NULL, 0};
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
+                    &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
+                         VPX_DL_GOOD_QUALITY, &stats)) {}
+
+  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  return stats;
+}
+
+static void pass1(vpx_image_t *raw,
+                  FILE *infile,
+                  const char *outfile_name,
+                  const VpxInterface *encoder,
+                  const vpx_codec_enc_cfg_t *cfg) {
+  VpxVideoInfo info = {
+    encoder->fourcc,
+    cfg->g_w,
+    cfg->g_h,
+    {cfg->g_timebase.num, cfg->g_timebase.den}
+  };
+  VpxVideoWriter *writer = NULL;
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+
+  writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing", outfile_name);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {}
+
+  printf("\n");
+
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
 }
 
 int main(int argc, char **argv) {
   FILE *infile = NULL;
-  VpxVideoWriter *writer = NULL;
+  int w, h;
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   vpx_image_t raw;
   vpx_codec_err_t res;
-  vpx_fixed_buf_t stats = {0};
-  VpxVideoInfo info = {0};
+  vpx_fixed_buf_t stats;
+
   const VpxInterface *encoder = NULL;
-  int pass;
   const int fps = 30;        // TODO(dkovalev) add command line argument
   const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
   const char *const codec_arg = argv[1];
@@ -146,85 +222,44 @@
   if (!encoder)
     die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
-  info.time_base.numerator = 1;
-  info.time_base.denominator = fps;
-  info.frame_width = strtol(width_arg, NULL, 0);
-  info.frame_height = strtol(height_arg, NULL, 0);
+  w = strtol(width_arg, NULL, 0);
+  h = strtol(height_arg, NULL, 0);
 
-  if (info.frame_width <= 0 ||
-      info.frame_height <= 0 ||
-      (info.frame_width % 2) != 0 ||
-      (info.frame_height % 2) != 0) {
-    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
-  }
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
 
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
-                                             info.frame_height, 1)) {
-    die("Failed to allocate image", info.frame_width, info.frame_height);
-  }
-
-  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
-  if (!writer)
-    die("Failed to open %s for writing", outfile_arg);
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
+  // Configuration
   res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
   if (res)
     die_codec(&codec, "Failed to get default codec config.");
 
-  cfg.g_w = info.frame_width;
-  cfg.g_h = info.frame_height;
-  cfg.g_timebase.num = info.time_base.numerator;
-  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
   cfg.rc_target_bitrate = bitrate;
 
-  for (pass = 0; pass < 2; ++pass) {
-    int frame_count = 0;
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
 
-    if (pass == 0) {
-      cfg.g_pass = VPX_RC_FIRST_PASS;
-    } else {
-      cfg.g_pass = VPX_RC_LAST_PASS;
-      cfg.rc_twopass_stats_in = stats;
-    }
+  // Pass 0
+  cfg.g_pass = VPX_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg);
 
-    if (!(infile = fopen(infile_arg, "rb")))
-      die("Failed to open %s for reading", infile_arg);
-
-    if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-      die_codec(&codec, "Failed to initialize encoder");
-
-    while (vpx_img_read(&raw, infile)) {
-      ++frame_count;
-
-      if (pass == 0) {
-        get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                        &stats);
-      } else {
-        encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                     writer);
-      }
-    }
-
-    if (pass == 0) {
-      get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                      &stats);
-    } else {
-      printf("\n");
-    }
-
-    fclose(infile);
-    printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count);
-    if (vpx_codec_destroy(&codec))
-      die_codec(&codec, "Failed to destroy codec.");
-  }
-
-  vpx_img_free(&raw);
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = VPX_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg);
   free(stats.buf);
 
-  vpx_video_writer_close(writer);
+  vpx_img_free(&raw);
+  fclose(infile);
 
   return EXIT_SUCCESS;
 }
diff --git a/libvpx/examples/vp8_multi_resolution_encoder.c b/libvpx/examples/vp8_multi_resolution_encoder.c
index d41e442..2b03204 100644
--- a/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -13,21 +13,38 @@
  * High-resolution input video is down-sampled to lower-resolutions. The
  * encoder then encodes the video and outputs multiple bitstreams with
  * different resolutions.
+ *
+ * This test also allows for setting temporal layers for each spatial layer.
+ * A different number of temporal layers per spatial stream may be used.
+ * Currently up to 3 temporal layers per spatial stream (encoder) are supported
+ * in this test.
  */
+
+#include "./vpx_config.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <string.h>
 #include <math.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
+#include <assert.h>
+#include <sys/time.h>
+#if USE_POSIX_MMAP
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+#include "vpx_ports/vpx_timer.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
-#include "./tools_common.h"
+#include "../tools_common.h"
 #define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056
 
-void usage_exit() {
+void usage_exit(void) {
   exit(EXIT_FAILURE);
 }
 
@@ -39,8 +56,13 @@
  * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
  * 320x180(level 2) respectively.
  */
+
+/* Number of encoders (spatial resolutions) used in this test. */
 #define NUM_ENCODERS 3
 
+/* Maximum number of temporal layers allowed for this test. */
+#define MAX_NUM_TEMPORAL_LAYERS 3
+
 /* This example uses the scaler function in libyuv. */
 #include "third_party/libyuv/include/libyuv/basic_types.h"
 #include "third_party/libyuv/include/libyuv/scale.h"
@@ -152,21 +174,172 @@
     (void) fwrite(header, 1, 12, outfile);
 }
 
+/* Temporal scaling parameters */
+/* This sets all the temporal layer parameters given |num_temporal_layers|,
+ * including the target bit allocation across temporal layers. Bit allocation
+ * parameters will be passed in as user parameters in another version.
+ */
+static void set_temporal_layer_pattern(int num_temporal_layers,
+                                       vpx_codec_enc_cfg_t *cfg,
+                                       int bitrate,
+                                       int *layer_flags)
+{
+    assert(num_temporal_layers <= MAX_NUM_TEMPORAL_LAYERS);
+    switch (num_temporal_layers)
+    {
+    case 1:
+    {
+        /* 1-layer */
+        cfg->ts_number_layers     = 1;
+        cfg->ts_periodicity       = 1;
+        cfg->ts_rate_decimator[0] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_target_bitrate[0] = bitrate;
+
+        // Update L only.
+        layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+        break;
+    }
+
+    case 2:
+    {
+        /* 2-layers, with sync point at first frame of layer 1. */
+        cfg->ts_number_layers     = 2;
+        cfg->ts_periodicity       = 2;
+        cfg->ts_rate_decimator[0] = 2;
+        cfg->ts_rate_decimator[1] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_layer_id[1] = 1;
+        // Use 60/40 bit allocation as example.
+        cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+        cfg->ts_target_bitrate[1] = bitrate;
+
+        /* 0=L, 1=GF */
+        // ARF is used as predictor for all frames, and is only updated on
+        // key frame. Sync point every 8 frames.
+
+        // Layer 0: predict from L and ARF, update L and G.
+        layer_flags[0] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 1: sync point: predict from L and ARF, and update G.
+        layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 0, predict from L and ARF, update L.
+        layer_flags[2] = VP8_EFLAG_NO_REF_GF  |
+                         VP8_EFLAG_NO_UPD_GF  |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 1: predict from L, G and ARF, and update G.
+        layer_flags[3] = VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 0
+        layer_flags[4] = layer_flags[2];
+
+        // Layer 1
+        layer_flags[5] = layer_flags[3];
+
+        // Layer 0
+        layer_flags[6] = layer_flags[4];
+
+        // Layer 1
+        layer_flags[7] = layer_flags[5];
+        break;
+    }
+
+    case 3:
+    default:
+    {
+        // 3-layers structure where ARF is used as predictor for all frames,
+        // and is only updated on key frame.
+        // Sync points for layer 1 and 2 every 8 frames.
+        cfg->ts_number_layers     = 3;
+        cfg->ts_periodicity       = 4;
+        cfg->ts_rate_decimator[0] = 4;
+        cfg->ts_rate_decimator[1] = 2;
+        cfg->ts_rate_decimator[2] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_layer_id[1] = 2;
+        cfg->ts_layer_id[2] = 1;
+        cfg->ts_layer_id[3] = 2;
+        // Use 40/20/40 bit allocation as example.
+        cfg->ts_target_bitrate[0] = 0.4f * bitrate;
+        cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+        cfg->ts_target_bitrate[2] = bitrate;
+
+        /* 0=L, 1=GF, 2=ARF */
+
+        // Layer 0: predict from L and ARF; update L and G.
+        layer_flags[0] =  VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_REF_GF;
+
+        // Layer 2: sync point: predict from L and ARF; update none.
+        layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 1: sync point: predict from L and ARF; update G.
+        layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[3] = VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 0: predict from L and ARF; update L.
+        layer_flags[4] = VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_REF_GF;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[5] = layer_flags[3];
+
+        // Layer 1: predict from L, G, ARF; update G.
+        layer_flags[6] = VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[7] = layer_flags[3];
+        break;
+    }
+    }
+}
+
+/* The periodicity of the pattern given the number of temporal layers. */
+static int periodicity_to_num_layers[MAX_NUM_TEMPORAL_LAYERS] = {1, 8, 8};
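/* Illustrative example (values chosen for illustration): for
 * num_temporal_layers == 2, bitrate == 500 kbps and a 30 fps input, the
 * case-2 branch of set_temporal_layer_pattern() above produces
 *   ts_target_bitrate = {300, 500}   (cumulative: base layer 60%, total 100%)
 *   ts_rate_decimator = {2, 1}       (base layer at 15 fps, all layers at 30)
 *   ts_periodicity    = 2
 * and an 8-entry layer_flags pattern, matching
 * periodicity_to_num_layers[2 - 1] == 8. */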
+
 int main(int argc, char **argv)
 {
-    FILE                *infile, *outfile[NUM_ENCODERS];
+    FILE                 *infile, *outfile[NUM_ENCODERS];
+    FILE                 *downsampled_input[NUM_ENCODERS - 1];
+    char                 filename[50];
     vpx_codec_ctx_t      codec[NUM_ENCODERS];
     vpx_codec_enc_cfg_t  cfg[NUM_ENCODERS];
-    vpx_codec_pts_t      frame_cnt = 0;
+    int                  frame_cnt = 0;
     vpx_image_t          raw[NUM_ENCODERS];
     vpx_codec_err_t      res[NUM_ENCODERS];
 
     int                  i;
     long                 width;
     long                 height;
+    int                  length_frame;
     int                  frame_avail;
     int                  got_data;
     int                  flags = 0;
+    int                  layer_id = 0;
+
+    int                  layer_flags[VPX_TS_MAX_PERIODICITY * NUM_ENCODERS]
+                                     = {0};
+    int                  flag_periodicity;
 
     /*Currently, only realtime mode is supported in multi-resolution encoding.*/
     int                  arg_deadline = VPX_DL_REALTIME;
@@ -175,39 +348,51 @@
        don't need to know PSNR, which will skip PSNR calculation and save
        encoding time. */
     int                  show_psnr = 0;
+    int                  key_frame_insert = 0;
     uint64_t             psnr_sse_total[NUM_ENCODERS] = {0};
     uint64_t             psnr_samples_total[NUM_ENCODERS] = {0};
     double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
     int                  psnr_count[NUM_ENCODERS] = {0};
 
+    double               cx_time = 0;
+    struct  timeval      tv1, tv2, difftv;
+
     /* Set the required target bitrates for each resolution level.
      * If target bitrate for highest-resolution level is set to 0,
      * (i.e. target_bitrate[0]=0), we skip encoding at that level.
      */
     unsigned int         target_bitrate[NUM_ENCODERS]={1000, 500, 100};
+
     /* Enter the frame rate of the input video */
     int                  framerate = 30;
+
     /* Set down-sampling factor for each resolution level.
        dsf[0] controls down sampling from level 0 to level 1;
        dsf[1] controls down sampling from level 1 to level 2;
        dsf[2] is not used. */
     vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
 
-    if(argc!= (5+NUM_ENCODERS))
-        die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
+    /* Set the number of temporal layers for each encoder/resolution level,
+     * starting from the highest resolution down to the lowest resolution. */
+    unsigned int         num_temporal_layers[NUM_ENCODERS] = {3, 3, 3};
+
+    if(argc!= (7 + 3 * NUM_ENCODERS))
+        die("Usage: %s <width> <height> <frame_rate>  <infile> <outfile(s)> "
+            "<rate_encoder(s)> <temporal_layer(s)> <key_frame_insert> <output psnr?> \n",
             argv[0]);
 
     printf("Using %s\n",vpx_codec_iface_name(interface));
 
     width = strtol(argv[1], NULL, 0);
     height = strtol(argv[2], NULL, 0);
+    framerate = strtol(argv[3], NULL, 0);
 
     if(width < 16 || width%2 || height <16 || height%2)
         die("Invalid resolution: %ldx%ld", width, height);
 
     /* Open input video file for encoding */
-    if(!(infile = fopen(argv[3], "rb")))
-        die("Failed to open %s for reading", argv[3]);
+    if(!(infile = fopen(argv[4], "rb")))
+        die("Failed to open %s for reading", argv[4]);
 
     /* Open output file for each encoder to output bitstreams */
     for (i=0; i< NUM_ENCODERS; i++)
@@ -218,11 +403,40 @@
             continue;
         }
 
-        if(!(outfile[i] = fopen(argv[i+4], "wb")))
+        if(!(outfile[i] = fopen(argv[i+5], "wb")))
             die("Failed to open %s for writing", argv[i+4]);
     }
 
-    show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0);
+    // Bitrates per spatial layer: overwrite default rates above.
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
+    }
+
+    // Temporal layers per spatial layers: overwrite default settings above.
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+        if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
+          die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
+              num_temporal_layers);
+    }
+
+    /* Open file to write out each spatially downsampled input stream. */
+    for (i=0; i< NUM_ENCODERS - 1; i++)
+    {
+        // Highest resolution is encoder 0.
+        if (sprintf(filename,"ds%d.yuv",NUM_ENCODERS - i) < 0)
+        {
+            return EXIT_FAILURE;
+        }
+        downsampled_input[i] = fopen(filename,"wb");
+    }
+
+    key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+
+    show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+
 
     /* Populate default encoder configuration */
     for (i=0; i< NUM_ENCODERS; i++)
@@ -240,14 +454,13 @@
     /* Highest-resolution encoder settings */
     cfg[0].g_w = width;
     cfg[0].g_h = height;
-    cfg[0].g_threads = 1;                           /* number of threads used */
-    cfg[0].rc_dropframe_thresh = 30;
+    cfg[0].rc_dropframe_thresh = 0;
     cfg[0].rc_end_usage = VPX_CBR;
     cfg[0].rc_resize_allowed = 0;
-    cfg[0].rc_min_quantizer = 4;
+    cfg[0].rc_min_quantizer = 2;
     cfg[0].rc_max_quantizer = 56;
-    cfg[0].rc_undershoot_pct = 98;
-    cfg[0].rc_overshoot_pct = 100;
+    cfg[0].rc_undershoot_pct = 100;
+    cfg[0].rc_overshoot_pct = 15;
     cfg[0].rc_buf_initial_sz = 500;
     cfg[0].rc_buf_optimal_sz = 600;
     cfg[0].rc_buf_sz = 1000;
@@ -258,7 +471,6 @@
     /* Note: These 3 settings are copied to all levels. But, except the lowest
      * resolution level, all other levels are set to VPX_KF_DISABLED internally.
      */
-    //cfg[0].kf_mode           = VPX_KF_DISABLED;
     cfg[0].kf_mode           = VPX_KF_AUTO;
     cfg[0].kf_min_dist = 3000;
     cfg[0].kf_max_dist = 3000;
@@ -272,7 +484,6 @@
     {
         memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
 
-        cfg[i].g_threads = 1;                       /* number of threads used */
         cfg[i].rc_target_bitrate = target_bitrate[i];
 
         /* Note: Width & height of other-resolution encoders are calculated
@@ -292,6 +503,13 @@
         if((cfg[i].g_h)%2)cfg[i].g_h++;
     }
 
+
+    // Set the number of threads per encode/spatial layer.
+    // (1, 1, 1) means no encoder threading.
+    cfg[0].g_threads = 2;
+    cfg[1].g_threads = 1;
+    cfg[2].g_threads = 1;
+
     /* Allocate image for each encoder */
     for (i=0; i< NUM_ENCODERS; i++)
         if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
@@ -306,6 +524,15 @@
         if(outfile[i])
             write_ivf_file_header(outfile[i], &cfg[i], 0);
 
+    /* Temporal layers settings */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        set_temporal_layer_pattern(num_temporal_layers[i],
+                                   &cfg[i],
+                                   cfg[i].rc_target_bitrate,
+                                   &layer_flags[i * VPX_TS_MAX_PERIODICITY]);
+    }
+
     /* Initialize multi-encoder */
     if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
                                 (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
@@ -316,15 +543,16 @@
     for ( i=0; i<NUM_ENCODERS; i++)
     {
         int speed = -6;
+        /* Lower speed for the lowest resolution. */
+        if (i == NUM_ENCODERS - 1) speed = -4;
         if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
             die_codec(&codec[i], "Failed to set cpu_used");
     }
 
-    /* Set static threshold. */
+    /* Set static threshold = 1 for all encoders */
     for ( i=0; i<NUM_ENCODERS; i++)
     {
-        unsigned int static_thresh = 1;
-        if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
+        if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
             die_codec(&codec[i], "Failed to set static threshold");
     }
 
@@ -338,6 +566,23 @@
             die_codec(&codec[i], "Failed to set noise_sensitivity");
     }
 
+    /* Set the number of token partitions */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        if(vpx_codec_control(&codec[i], VP8E_SET_TOKEN_PARTITIONS, 1))
+            die_codec(&codec[i], "Failed to set token partitions");
+    }
+
+    /* Set the max intra target bitrate */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        unsigned int max_intra_size_pct =
+            (int)(((double)cfg[0].rc_buf_optimal_sz * 0.5) * framerate / 10);
+        if(vpx_codec_control(&codec[i], VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                             max_intra_size_pct))
+            die_codec(&codec[i], "Failed to set max intra target bitrate");
+       //printf("%d %d \n",i,max_intra_size_pct);
+    }
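    /* Worked example (assuming a 30 fps input as in the default above): with
     * rc_buf_optimal_sz = 600, max_intra_size_pct = 600 * 0.5 * 30 / 10 = 900,
     * i.e. a key frame may spend roughly up to 9x the average per-frame
     * bandwidth. */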
 
     frame_avail = 1;
     got_data = 0;
@@ -364,18 +609,55 @@
                           raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
                           raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
                           raw[i].d_w, raw[i].d_h, 1);
+                /* Write out down-sampled input. */
+                length_frame = cfg[i].g_w *  cfg[i].g_h *3/2;
+                if (fwrite(raw[i].planes[0], 1, length_frame,
+                           downsampled_input[NUM_ENCODERS - i - 1]) !=
+                               length_frame)
+                {
+                    return EXIT_FAILURE;
+                }
             }
         }
 
-        /* Encode each frame at multi-levels */
-        if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
-            frame_cnt, 1, flags, arg_deadline))
-            die_codec(&codec[0], "Failed to encode frame");
+        /* Set the flags (reference and update) for all the encoders.*/
+        for ( i=0; i<NUM_ENCODERS; i++)
+        {
+            layer_id = cfg[i].ts_layer_id[frame_cnt % cfg[i].ts_periodicity];
+            flags = 0;
+            flag_periodicity = periodicity_to_num_layers
+                [num_temporal_layers[i] - 1];
+            flags = layer_flags[i * VPX_TS_MAX_PERIODICITY +
+                                frame_cnt % flag_periodicity];
+            // Key frame flag for first frame.
+            if (frame_cnt == 0)
+            {
+                flags |= VPX_EFLAG_FORCE_KF;
+            }
+            if (frame_cnt > 0 && frame_cnt == key_frame_insert)
+            {
+                flags = VPX_EFLAG_FORCE_KF;
+            }
 
+            vpx_codec_control(&codec[i], VP8E_SET_FRAME_FLAGS, flags);
+            vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        }
+
+        gettimeofday(&tv1, NULL);
+        /* Encode each frame at multi-levels */
+        /* Note the flags must be set to 0 in the encode call if they are set
+           for each frame with the vpx_codec_control(), as done above. */
+        if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+            frame_cnt, 1, 0, arg_deadline))
+        {
+            die_codec(&codec[0], "Failed to encode frame");
+        }
+        gettimeofday(&tv2, NULL);
+        timersub(&tv2, &tv1, &difftv);
+        cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
         for (i=NUM_ENCODERS-1; i>=0 ; i--)
         {
             got_data = 0;
-
             while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
             {
                 got_data = 1;
@@ -394,7 +676,6 @@
                             psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
                             for (j = 0; j < 4; j++)
                             {
-                                //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]);
                                 psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
                             }
                             psnr_count[i]++;
@@ -405,13 +686,15 @@
                         break;
                 }
                 printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
-                       && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
+                       && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":"");
                 fflush(stdout);
             }
         }
         frame_cnt++;
     }
     printf("\n");
+    printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
+           1000000 * (double)frame_cnt / (double)cx_time);
 
     fclose(infile);
 
diff --git a/libvpx/examples/vp8cx_set_ref.c b/libvpx/examples/vp8cx_set_ref.c
index 46a40ca..8b4cc30 100644
--- a/libvpx/examples/vp8cx_set_ref.c
+++ b/libvpx/examples/vp8cx_set_ref.c
@@ -50,25 +50,25 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> <frame>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
 
-static void encode_frame(vpx_codec_ctx_t *codec,
-                         vpx_image_t *img,
-                         int frame_index,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -77,6 +77,8 @@
     die_codec(codec, "Failed to encode frame");
 
   while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
       if (!vpx_video_writer_write_frame(writer,
@@ -90,6 +92,8 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
 }
 
 int main(int argc, char **argv) {
@@ -160,6 +164,7 @@
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
     die_codec(&codec, "Failed to initialize encoder");
 
+  // Encode frames.
   while (vpx_img_read(&raw, infile)) {
     if (frame_count + 1 == update_frame_num) {
       vpx_ref_frame_t ref;
@@ -171,7 +176,9 @@
 
     encode_frame(&codec, &raw, frame_count++, writer);
   }
-  encode_frame(&codec, NULL, -1, writer);
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {}
 
   printf("\n");
   fclose(infile);
diff --git a/libvpx/examples/vp9_lossless_encoder.c b/libvpx/examples/vp9_lossless_encoder.c
new file mode 100644
index 0000000..8272516
--- /dev/null
+++ b/libvpx/examples/vp9_lossless_encoder.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
+                  "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        int flags,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1,
+                                               flags, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;
+
+  exec_name = argv[0];
+
+  if (argc < 5)
+    die("Invalid number of arguments");
+
+  encoder = get_vpx_encoder_by_name("vp9");
+  if (!encoder)
+     die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
+    die_codec(&codec, "Failed to use lossless mode");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    encode_frame(&codec, &raw, frame_count++, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c
index 223f37e..5a60976 100644
--- a/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -14,40 +14,50 @@
  * that benefit from a scalable bitstream.
  */
 
+#include <math.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 
-#include "./args.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+
+#include "../args.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
-#include "./vpxstats.h"
+#include "../vpxstats.h"
+#define OUTPUT_RC_STATS 1
 
 static const arg_def_t skip_frames_arg =
     ARG_DEF("s", "skip-frames", 1, "input frames to skip");
 static const arg_def_t frames_arg =
     ARG_DEF("f", "frames", 1, "number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "number of threads to use");
+#if OUTPUT_RC_STATS
+static const arg_def_t output_rc_stats_arg =
+    ARG_DEF("rcstat", "output_rc_stats", 1, "output rc stats");
+#endif
 static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width");
 static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height");
 static const arg_def_t timebase_arg =
     ARG_DEF("t", "timebase", 1, "timebase (num/den)");
 static const arg_def_t bitrate_arg = ARG_DEF(
     "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
-static const arg_def_t layers_arg =
-    ARG_DEF("l", "layers", 1, "number of SVC layers");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
+static const arg_def_t temporal_layering_mode_arg =
+    ARG_DEF("tlm", "temporal-layering-mode", 1, "temporal layering scheme."
+        "VP9E_TEMPORAL_LAYERING_MODE");
 static const arg_def_t kf_dist_arg =
     ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
-static const arg_def_t quantizers_arg =
-    ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
-            "be applied to key frames if -qn is not specified (lowest to "
-            "highest layer)");
 static const arg_def_t passes_arg =
     ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
 static const arg_def_t pass_arg =
@@ -62,13 +72,44 @@
     ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
 static const arg_def_t max_bitrate_arg =
     ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
+static const arg_def_t lag_in_frame_arg =
+    ARG_DEF(NULL, "lag-in-frames", 1, "Number of frame to input before "
+        "generating any outputs");
+static const arg_def_t rc_end_usage_arg =
+    ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "speed configuration");
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdepth_arg =
+    ARG_DEF_ENUM("d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ",
+                 bitdepth_enum);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 
 static const arg_def_t *svc_args[] = {
   &frames_arg,        &width_arg,         &height_arg,
-  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  &passes_arg,
-  &pass_arg,          &fpf_name_arg,      &min_q_arg,       &max_q_arg,
-  &min_bitrate_arg,   &max_bitrate_arg,   NULL
+  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &spatial_layers_arg,
+  &kf_dist_arg,       &scale_factors_arg, &passes_arg,      &pass_arg,
+  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
+  &max_bitrate_arg,   &temporal_layers_arg, &temporal_layering_mode_arg,
+  &lag_in_frame_arg,  &threads_arg,
+#if OUTPUT_RC_STATS
+  &output_rc_stats_arg,
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  &speed_arg,
+  &rc_end_usage_arg,  NULL
 };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -79,7 +120,12 @@
 static const uint32_t default_timebase_den = 60;
 static const uint32_t default_bitrate = 1000;
 static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_temporal_layers = 1;
 static const uint32_t default_kf_dist = 100;
+static const uint32_t default_temporal_layering_mode = 0;
+static const uint32_t default_output_rc_stats = 0;
+static const int32_t default_speed = -1;  // -1 means use library default.
+static const uint32_t default_threads = 0;  // zero means use library default.
 
 typedef struct {
   const char *input_filename;
@@ -94,7 +140,7 @@
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
           exec_name);
   fprintf(stderr, "Options:\n");
@@ -115,10 +161,18 @@
   const char *fpf_file_name = NULL;
   unsigned int min_bitrate = 0;
   unsigned int max_bitrate = 0;
+  char string_options[1024] = {0};
 
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
   svc_ctx->log_level = SVC_LOG_DEBUG;
   svc_ctx->spatial_layers = default_spatial_layers;
+  svc_ctx->temporal_layers = default_temporal_layers;
+  svc_ctx->temporal_layering_mode = default_temporal_layering_mode;
+#if OUTPUT_RC_STATS
+  svc_ctx->output_rc_stat = default_output_rc_stats;
+#endif
+  svc_ctx->speed = default_speed;
+  svc_ctx->threads = default_threads;
 
   // start with default encoder configuration
   res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -156,15 +210,30 @@
       enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &skip_frames_arg, argi)) {
       app_input->frames_to_skip = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &layers_arg, argi)) {
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
       svc_ctx->spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_ctx->temporal_layers = arg_parse_uint(&arg);
+#if OUTPUT_RC_STATS
+    } else if (arg_match(&arg, &output_rc_stats_arg, argi)) {
+      svc_ctx->output_rc_stat = arg_parse_uint(&arg);
+#endif
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      svc_ctx->speed = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      svc_ctx->threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
+      svc_ctx->temporal_layering_mode =
+          enc_cfg->temporal_layering_mode = arg_parse_int(&arg);
+      if (svc_ctx->temporal_layering_mode) {
+        enc_cfg->g_error_resilient = 1;
+      }
     } else if (arg_match(&arg, &kf_dist_arg, argi)) {
       enc_cfg->kf_min_dist = arg_parse_uint(&arg);
       enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
-      vpx_svc_set_scale_factors(svc_ctx, arg.val);
-    } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val);
+      snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
       if (passes < 1 || passes > 2) {
@@ -178,18 +247,49 @@
     } else if (arg_match(&arg, &fpf_name_arg, argi)) {
       fpf_file_name = arg.val;
     } else if (arg_match(&arg, &min_q_arg, argi)) {
-      enc_cfg->rc_min_quantizer = arg_parse_uint(&arg);
+      snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &max_q_arg, argi)) {
-      enc_cfg->rc_max_quantizer = arg_parse_uint(&arg);
+      snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &min_bitrate_arg, argi)) {
       min_bitrate = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &max_bitrate_arg, argi)) {
       max_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &lag_in_frame_arg, argi)) {
+      enc_cfg->g_lag_in_frames = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &rc_end_usage_arg, argi)) {
+      enc_cfg->rc_end_usage = arg_parse_uint(&arg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
+      switch (enc_cfg->g_bit_depth) {
+        case VPX_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case VPX_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 2;
+          break;
+        case VPX_BITS_12:
+          enc_cfg->g_input_bit_depth = 12;
+          enc_cfg->g_profile = 2;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+          break;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     } else {
       ++argj;
     }
   }
 
+  // There will be a space in front of the string options
+  if (strlen(string_options) > 0)
+    vpx_svc_set_options(svc_ctx, string_options + 1);
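  /* For example (hypothetical values), string_options may now hold
   * " scale-factors=1/4,1/2,1/1 min-quantizers=30,30,30"; the leading space
   * added by the snprintf() calls above is skipped when the string is passed
   * to vpx_svc_set_options(). */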
+
   if (passes == 0 || passes == 1) {
     if (pass) {
       fprintf(stderr, "pass is ignored since there's only one pass\n");
@@ -260,6 +360,185 @@
       enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
 }
 
+#if OUTPUT_RC_STATS
+// For rate control encoding stats.
+struct RateControlStats {
+  // Number of input frames per layer.
+  int layer_input_frames[VPX_MAX_LAYERS];
+  // Total (cumulative) number of encoded frames per layer.
+  int layer_tot_enc_frames[VPX_MAX_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[VPX_MAX_LAYERS];
+  // Framerate per layer (cumulative).
+  double layer_framerate[VPX_MAX_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[VPX_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[VPX_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[VPX_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative).
+  double layer_encoding_bitrate[VPX_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+};
+
+// Note: these rate control stats assume only 1 key frame in the
+// sequence (i.e., first frame only).
+static void set_rate_control_stats(struct RateControlStats *rc,
+                                     vpx_codec_enc_cfg_t *cfg) {
+  unsigned int sl, tl;
+  // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+  // per-frame-bandwidth, for the rate control encoding stats below.
+  const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
+
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int tlayer0 = sl * cfg->ts_number_layers;
+      rc->layer_framerate[layer] =
+          framerate / cfg->ts_rate_decimator[tl];
+      if (tl > 0) {
+        rc->layer_pfb[layer] = 1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+                cfg->layer_target_bitrate[layer - 1]) /
+            (rc->layer_framerate[layer] -
+                rc->layer_framerate[layer - 1]);
+      } else {
+        rc->layer_pfb[tlayer0] = 1000.0 *
+            cfg->layer_target_bitrate[tlayer0] /
+            rc->layer_framerate[tlayer0];
+      }
+      rc->layer_input_frames[layer] = 0;
+      rc->layer_enc_frames[layer] = 0;
+      rc->layer_tot_enc_frames[layer] = 0;
+      rc->layer_encoding_bitrate[layer] = 0.0;
+      rc->layer_avg_frame_size[layer] = 0.0;
+      rc->layer_avg_rate_mismatch[layer] = 0.0;
+    }
+  }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
+}
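/* Illustrative example (values chosen for illustration): with one spatial and
 * two temporal layers, cumulative layer_target_bitrate = {300, 500} kbps and
 * layer framerates {15, 30} fps, the loop above yields
 *   layer_pfb[0] = 1000 * 300 / 15                = 20000 bits per frame
 *   layer_pfb[1] = 1000 * (500 - 300) / (30 - 15) ~= 13333 bits per frame,
 * i.e. the target size of the extra frames contributed by the enhancement
 * layer. */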
+
+static void printout_rate_control_summary(struct RateControlStats *rc,
+                                          vpx_codec_enc_cfg_t *cfg,
+                                          int frame_cnt) {
+  unsigned int sl, tl;
+  int tot_num_frames = 0;
+  double perc_fluctuation = 0.0;
+  printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
+  printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
+      cfg->ss_number_layers, cfg->ts_number_layers);
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int num_dropped = (tl > 0) ?
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) :
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1);
+      if (!sl)
+        tot_num_frames += rc->layer_input_frames[layer];
+      rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
+          rc->layer_encoding_bitrate[layer] / tot_num_frames;
+      rc->layer_avg_frame_size[layer] = rc->layer_avg_frame_size[layer] /
+          rc->layer_enc_frames[layer];
+      rc->layer_avg_rate_mismatch[layer] =
+          100.0 * rc->layer_avg_rate_mismatch[layer] /
+          rc->layer_enc_frames[layer];
+      printf("For layer#: sl%d tl%d \n", sl, tl);
+      printf("Bitrate (target vs actual): %d %f.0 kbps\n",
+             cfg->layer_target_bitrate[layer],
+             rc->layer_encoding_bitrate[layer]);
+      printf("Average frame size (target vs actual): %f %f bits\n",
+             rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]);
+      printf("Average rate_mismatch: %f\n",
+             rc->layer_avg_rate_mismatch[layer]);
+      printf("Number of input frames, encoded (non-key) frames, "
+          "and percent dropped frames: %d %d %f.0 \n",
+          rc->layer_input_frames[layer], rc->layer_enc_frames[layer],
+          100.0 * num_dropped / rc->layer_input_frames[layer]);
+      printf("\n");
+    }
+  }
+  rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+      rc->avg_st_encoding_bitrate;
+  printf("Short-time stats, for window of %d frames: \n", rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+         rc->avg_st_encoding_bitrate,
+         sqrt(rc->variance_st_encoding_bitrate),
+         perc_fluctuation);
+  if (frame_cnt != tot_num_frames)
+    die("Error: Number of input frames not equal to output encoded frames != "
+        "%d tot_num_frames = %d\n", frame_cnt, tot_num_frames);
+}
+
+vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                       size_t data_sz,
+                                       uint32_t sizes[8], int *count) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  marker = *(data + data_sz - 1);
+  *count = 0;
+
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = *(data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+#endif
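/* Worked example (bytes chosen for illustration): a trailing marker byte of
 * 0xc9 satisfies (0xc9 & 0xe0) == 0xc0 and encodes
 *   frames = (0xc9 & 0x7) + 1 = 2 and mag = ((0xc9 >> 3) & 0x3) + 1 = 2,
 * so index_sz = 2 + 2 * 2 = 6. For sub-frame sizes of 300 and 25 bytes the
 * superframe index would be the byte sequence
 *   c9 2c 01 19 00 c9
 * (marker, two little-endian 2-byte sizes, marker repeated). */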
+
 int main(int argc, const char **argv) {
   AppInput app_input = {0};
   VpxVideoWriter *writer = NULL;
@@ -275,16 +554,33 @@
   int frame_duration = 1; /* 1 timebase tick per frame */
   FILE *infile = NULL;
   int end_of_stream = 0;
-  int frame_size;
-
+  int frames_received = 0;
+#if OUTPUT_RC_STATS
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
+  struct RateControlStats rc;
+  vpx_svc_layer_id_t layer_id;
+  int sl, tl;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate  = 30.0;
+#endif
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
   exec_name = argv[0];
   parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
 
   // Allocate image buffer
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ?
+                         VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
+                     enc_cfg.g_w, enc_cfg.g_h, 32)) {
     die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#else
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
+    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (!(infile = fopen(app_input.input_filename, "rb")))
     die("Failed to open %s for reading\n", app_input.input_filename);
@@ -294,15 +590,16 @@
       VPX_CODEC_OK)
     die("Failed to initialize encoder\n");
 
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    set_rate_control_stats(&rc, &enc_cfg);
+    framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num;
+  }
+#endif
+
   info.codec_fourcc = VP9_FOURCC;
   info.time_base.numerator = enc_cfg.g_timebase.num;
   info.time_base.denominator = enc_cfg.g_timebase.den;
-  if (vpx_svc_get_layer_resolution(&svc_ctx, svc_ctx.spatial_layers - 1,
-                                   (unsigned int *)&info.frame_width,
-                                   (unsigned int *)&info.frame_height) !=
-      VPX_CODEC_OK) {
-    die("Failed to get output resolution");
-  }
 
   if (!(app_input.passes == 2 && app_input.pass == 1)) {
     // We don't save the bitstream for the 1st pass on two pass rate control
@@ -311,13 +608,35 @@
     if (!writer)
       die("Failed to open %s for writing\n", app_input.output_filename);
   }
+#if OUTPUT_RC_STATS
+  // For now, just write temporal layer streams.
+  // TODO(wonkap): do spatial by re-writing superframe.
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      char file_name[PATH_MAX];
+
+      snprintf(file_name, sizeof(file_name), "%s_t%d.ivf",
+               app_input.output_filename, tl);
+      outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[tl])
+        die("Failed to open %s for writing", file_name);
+    }
+  }
+#endif
 
   // skip initial frames
   for (i = 0; i < app_input.frames_to_skip; ++i)
     vpx_img_read(&raw, infile);
 
+  if (svc_ctx.speed != -1)
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
+  if (svc_ctx.threads)
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+
   // Encode frames
   while (!end_of_stream) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
     if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
       // We need one extra vpx_svc_encode call at end of stream to flush
       // encoder and get remaining data
@@ -325,47 +644,149 @@
     }
 
     res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
-                         pts, frame_duration, VPX_DL_GOOD_QUALITY);
+                         pts, frame_duration, svc_ctx.speed >= 5 ?
+                         VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
+
     printf("%s", vpx_svc_get_message(&svc_ctx));
     if (res != VPX_CODEC_OK) {
       die_codec(&codec, "Failed to encode frame");
     }
-    if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
-        vpx_video_writer_write_frame(writer,
-                                     vpx_svc_get_buffer(&svc_ctx),
-                                     frame_size, pts);
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+      switch (cx_pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT: {
+          if (cx_pkt->data.frame.sz > 0) {
+#if OUTPUT_RC_STATS
+            uint32_t sizes[8];
+            int count = 0;
+#endif
+            vpx_video_writer_write_frame(writer,
+                                         cx_pkt->data.frame.buf,
+                                         cx_pkt->data.frame.sz,
+                                         cx_pkt->data.frame.pts);
+#if OUTPUT_RC_STATS
+            // TODO(marpan/wonkap): Put this (to line728) in a separate function.
+            if (svc_ctx.output_rc_stat) {
+              vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
+              parse_superframe_index(cx_pkt->data.frame.buf,
+                                     cx_pkt->data.frame.sz, sizes, &count);
+              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                        layer_id.temporal_layer_id];
+              }
+              for (tl = layer_id.temporal_layer_id;
+                  tl < enc_cfg.ts_number_layers; ++tl) {
+                vpx_video_writer_write_frame(outfile[tl],
+                                             cx_pkt->data.frame.buf,
+                                             cx_pkt->data.frame.sz,
+                                             cx_pkt->data.frame.pts);
+              }
+
+              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                for (tl = layer_id.temporal_layer_id;
+                    tl < enc_cfg.ts_number_layers; ++tl) {
+                  const int layer = sl * enc_cfg.ts_number_layers + tl;
+                  ++rc.layer_tot_enc_frames[layer];
+                  rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
+                  // Keep count of rate control stats per layer, for non-key
+                  // frames.
+                  if (tl == layer_id.temporal_layer_id &&
+                      !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+                    rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
+                    rc.layer_avg_rate_mismatch[layer] +=
+                        fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
+                        rc.layer_pfb[layer];
+                    ++rc.layer_enc_frames[layer];
+                  }
+                }
+              }
+
+              // Update for short-time encoding bitrate states, for moving
+              // window of size rc.window_size, shifted by rc.window_size / 2.
+              // Ignore first window segment, due to key frame.
+              if (frame_cnt > rc.window_size) {
+                tl = layer_id.temporal_layer_id;
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
+                }
+                if (frame_cnt % rc.window_size == 0) {
+                  rc.window_count += 1;
+                  rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                  rc.variance_st_encoding_bitrate +=
+                      (sum_bitrate / rc.window_size) *
+                      (sum_bitrate / rc.window_size);
+                  sum_bitrate = 0.0;
+                }
+              }
+
+              // Second shifted window.
+              if (frame_cnt > rc.window_size + rc.window_size / 2) {
+               tl = layer_id.temporal_layer_id;
+               for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                 sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
+               }
+
+               if (frame_cnt > 2 * rc.window_size &&
+                  frame_cnt % rc.window_size == 0) {
+                 rc.window_count += 1;
+                 rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                 rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate2 / rc.window_size) *
+                    (sum_bitrate2 / rc.window_size);
+                 sum_bitrate2 = 0.0;
+               }
+              }
+            }
+#endif
+          }
+
+          printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
+                 !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
+                 (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          ++frames_received;
+          break;
+        }
+        case VPX_CODEC_STATS_PKT: {
+          stats_write(&app_input.rc_stats,
+                      cx_pkt->data.twopass_stats.buf,
+                      cx_pkt->data.twopass_stats.sz);
+          break;
+        }
+        default: {
+          break;
+        }
       }
     }
-    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
-      stats_write(&app_input.rc_stats,
-                  vpx_svc_get_rc_stats_buffer(&svc_ctx),
-                  vpx_svc_get_rc_stats_buffer_size(&svc_ctx));
-    }
+
     if (!end_of_stream) {
       ++frame_cnt;
       pts += frame_duration;
     }
   }
-
   printf("Processed %d frames\n", frame_cnt);
-
   fclose(infile);
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    printout_rate_control_summary(&rc, &enc_cfg, frame_cnt);
+    printf("\n");
+  }
+#endif
   if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
-
   if (app_input.passes == 2)
     stats_close(&app_input.rc_stats, 1);
-
   if (writer) {
     vpx_video_writer_close(writer);
   }
-
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      vpx_video_writer_close(outfile[tl]);
+    }
+  }
+#endif
   vpx_img_free(&raw);
-
   // display average size, psnr
   printf("%s", vpx_svc_dump_statistics(&svc_ctx));
-
   vpx_svc_release(&svc_ctx);
-
   return EXIT_SUCCESS;
 }
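
For reference, the OUTPUT_RC_STATS additions above accumulate a short-time encoding bitrate over a moving window of rc.window_size frames (plus a second window shifted by half a window, with the first window skipped because of the key frame) and later report the mean, RMS deviation, and percent fluctuation of the per-window averages. A minimal standalone sketch of that statistic follows; the function name, frame sizes, and window length are illustrative only and not part of the patch:

  #include <math.h>
  #include <stdio.h>

  /* Per-window mean/variance of the instantaneous encoding bitrate, in the
   * same spirit as the OUTPUT_RC_STATS code. Frame sizes are in bytes. */
  static void window_bitrate_stats(const int *frame_bytes, int num_frames,
                                   double framerate, int window_size) {
    double sum_kbps = 0.0, avg = 0.0, sq = 0.0, window_avg, variance;
    int windows = 0, i;
    for (i = 0; i < num_frames; ++i) {
      /* 8 bits per byte, /1000 for kilobits, * framerate for a rate. */
      sum_kbps += 0.001 * 8.0 * frame_bytes[i] * framerate;
      if ((i + 1) % window_size == 0) {
        window_avg = sum_kbps / window_size;
        avg += window_avg;
        sq += window_avg * window_avg;
        ++windows;
        sum_kbps = 0.0;
      }
    }
    if (windows > 0) {
      avg /= windows;
      variance = sq / windows - avg * avg;
      printf("avg %.1f kbps, rms-deviation %.1f, fluctuation %.1f%%\n",
             avg, sqrt(variance), 100.0 * sqrt(variance) / avg);
    }
  }

  int main(void) {
    /* Hypothetical frame sizes (bytes) at 30 fps with a 5-frame window. */
    const int sizes[15] = { 2500,  900, 1100,  950, 1000,
                            1050,  980, 1020,  940, 1010,
                            3000,  850,  900,  870,  880 };
    window_bitrate_stats(sizes, 15, 30.0, 5);
    return 0;
  }
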
diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c
index be3e7b2..ee7de6b 100644
--- a/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -12,23 +12,23 @@
 //  encoding scheme based on temporal scalability for video applications
 //  that benefit from a scalable bitstream.
 
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
-#include "vpx_ports/vpx_timer.h"
+#include "../vpx_ports/vpx_timer.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   exit(EXIT_FAILURE);
 }
 
@@ -37,7 +37,8 @@
   kDenoiserOff,
   kDenoiserOnYOnly,
   kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive  // Aggressive mode not implemented currently.
+  kDenoiserOnYUVAggressive,
+  kDenoiserOnAdaptive
 };
 
 static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
@@ -60,6 +61,16 @@
   double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
   // Actual encoding bitrate per layer (cumulative).
   double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
 };
 
 // Note: these rate control metrics assume only 1 key frame in the
@@ -75,13 +86,13 @@
   // per-frame-bandwidth, for the rate control encoding stats below.
   const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
   rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
-  rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] /
+  rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] /
       rc->layer_framerate[0];
   for (i = 0; i < cfg->ts_number_layers; ++i) {
     if (i > 0) {
       rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
       rc->layer_pfb[i] = 1000.0 *
-          (cfg->ts_target_bitrate[i] - cfg->ts_target_bitrate[i - 1]) /
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
           (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
     }
     rc->layer_input_frames[i] = 0;
@@ -91,6 +102,10 @@
     rc->layer_avg_frame_size[i] = 0.0;
     rc->layer_avg_rate_mismatch[i] = 0.0;
   }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
 }
 
 static void printout_rate_control_summary(struct RateControlMetrics *rc,
@@ -98,6 +113,7 @@
                                           int frame_cnt) {
   unsigned int i = 0;
   int tot_num_frames = 0;
+  double perc_fluctuation = 0.0;
   printf("Total number of processed frames: %d\n\n", frame_cnt -1);
   printf("Rate control layer stats for %d layer(s):\n\n",
       cfg->ts_number_layers);
@@ -113,7 +129,7 @@
     rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
         rc->layer_enc_frames[i];
     printf("For layer#: %d \n", i);
-    printf("Bitrate (target vs actual): %d %f \n", cfg->ts_target_bitrate[i],
+    printf("Bitrate (target vs actual): %d %f \n", rc->layer_target_bitrate[i],
            rc->layer_encoding_bitrate[i]);
     printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i],
            rc->layer_avg_frame_size[i]);
@@ -124,6 +140,17 @@
         100.0 * num_dropped / rc->layer_input_frames[i]);
     printf("\n");
   }
+  rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+      rc->avg_st_encoding_bitrate;
+  printf("Short-time stats, for window of %d frames: \n",rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+         rc->avg_st_encoding_bitrate,
+         sqrt(rc->variance_st_encoding_bitrate),
+         perc_fluctuation);
   if ((frame_cnt - 1) != tot_num_frames)
     die("Error: Number of input frames not equal to output! \n");
 }
@@ -437,7 +464,7 @@
 }
 
 int main(int argc, char **argv) {
-  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS];
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   int frame_cnt = 0;
@@ -455,19 +482,39 @@
   int layering_mode = 0;
   int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
   int flag_periodicity = 1;
-  int max_intra_size_pct;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
   vpx_svc_layer_id_t layer_id = {0, 0};
+#else
+  vpx_svc_layer_id_t layer_id = {0};
+#endif
   const VpxInterface *encoder = NULL;
   FILE *infile = NULL;
   struct RateControlMetrics rc;
   int64_t cx_time = 0;
+  const int min_args_base = 11;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_bit_depth_t bit_depth = VPX_BITS_8;
+  int input_bit_depth = 8;
+  const int min_args = min_args_base + 1;
+#else
+  const int min_args = min_args_base;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate  = 30.0;
 
   exec_name = argv[0];
   // Check usage and arguments.
-  if (argc < 11) {
+  if (argc < min_args) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n", argv[0]);
+#else
     die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
         "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
         "<Rate_0> ... <Rate_nlayers-1> \n", argv[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   encoder = get_vpx_encoder_by_name(argv[3]);
@@ -487,13 +534,38 @@
     die("Invalid layering mode (0..12) %s", argv[10]);
   }
 
-  if (argc != 11 + mode_to_num_layers[layering_mode]) {
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (strtol(argv[argc-1], NULL, 0)) {
+    case 8:
+      bit_depth = VPX_BITS_8;
+      input_bit_depth = 8;
+      break;
+    case 10:
+      bit_depth = VPX_BITS_10;
+      input_bit_depth = 10;
+      break;
+    case 12:
+      bit_depth = VPX_BITS_12;
+      input_bit_depth = 12;
+      break;
+    default:
+      die("Invalid bit depth (8, 10, 12) %s", argv[argc-1]);
+  }
+  if (!vpx_img_alloc(&raw,
+                     bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 :
+                                               VPX_IMG_FMT_I42016,
+                     width, height, 32)) {
+    die("Failed to allocate image", width, height);
+  }
+#else
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
     die("Failed to allocate image", width, height);
   }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Populate encoder configuration.
   res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
@@ -506,6 +578,14 @@
   cfg.g_w = width;
   cfg.g_h = height;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth != VPX_BITS_8) {
+    cfg.g_bit_depth = bit_depth;
+    cfg.g_input_bit_depth = input_bit_depth;
+    cfg.g_profile = 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   // Timebase format e.g. 30fps: numerator=1, denominator = 30.
   cfg.g_timebase.num = strtol(argv[6], NULL, 0);
   cfg.g_timebase.den = strtol(argv[7], NULL, 0);
@@ -515,22 +595,35 @@
     die("Invalid speed setting: must be positive");
   }
 
-  for (i = 11; (int)i < 11 + mode_to_num_layers[layering_mode]; ++i) {
-    cfg.ts_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+  for (i = min_args_base;
+       (int)i < min_args_base + mode_to_num_layers[layering_mode];
+       ++i) {
+    rc.layer_target_bitrate[i - 11] = strtol(argv[i], NULL, 0);
+    if (strncmp(encoder->name, "vp8", 3) == 0)
+      cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
+    else if (strncmp(encoder->name, "vp9", 3) == 0)
+      cfg.layer_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11];
   }
 
   // Real time parameters.
   cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
   cfg.rc_end_usage = VPX_CBR;
-  cfg.rc_resize_allowed = 0;
   cfg.rc_min_quantizer = 2;
   cfg.rc_max_quantizer = 56;
+  if (strncmp(encoder->name, "vp9", 3) == 0)
+    cfg.rc_max_quantizer = 52;
   cfg.rc_undershoot_pct = 50;
   cfg.rc_overshoot_pct = 50;
   cfg.rc_buf_initial_sz = 500;
   cfg.rc_buf_optimal_sz = 600;
   cfg.rc_buf_sz = 1000;
 
+  // Disable dynamic resizing by default.
+  cfg.rc_resize_allowed = 0;
+
+  // Use 1 thread as default.
+  cfg.g_threads = 1;
+
   // Enable error resilient mode.
   cfg.g_error_resilient = 1;
   cfg.g_lag_in_frames   = 0;
@@ -539,6 +632,8 @@
   // Disable automatic keyframe placement.
   cfg.kf_min_dist = cfg.kf_max_dist = 3000;
 
+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
   set_temporal_layer_pattern(layering_mode,
                              &cfg,
                              layer_flags,
@@ -547,14 +642,15 @@
   set_rate_control_metrics(&rc, &cfg);
 
   // Target bandwidth for the whole stream.
-  // Set to ts_target_bitrate for highest layer (total bitrate).
-  cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1];
+  // Set to layer_target_bitrate for highest layer (total bitrate).
+  cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1];
 
   // Open input file.
   if (!(infile = fopen(argv[1], "rb"))) {
     die("Failed to open %s for reading", argv[1]);
   }
 
+  framerate = cfg.g_timebase.den / cfg.g_timebase.num;
   // Open an output file for each stream.
   for (i = 0; i < cfg.ts_number_layers; ++i) {
     char file_name[PATH_MAX];
@@ -569,50 +665,78 @@
     outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
     if (!outfile[i])
       die("Failed to open %s for writing", file_name);
+
+    assert(outfile[i] != NULL);
   }
   // No spatial layers in this encoder.
   cfg.ss_number_layers = 1;
 
   // Initialize codec.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (vpx_codec_enc_init(
+          &codec, encoder->codec_interface(), &cfg,
+          bit_depth == VPX_BITS_8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH))
+#else
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     die_codec(&codec, "Failed to initialize encoder");
 
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
-      vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
-      vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
-      vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-      vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, 0);
-      if (vpx_codec_control(&codec, VP9E_SET_SVC, 1)) {
-        die_codec(&codec, "Failed to set SVC");
+    vpx_svc_extra_cfg_t svc_params;
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+    if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))
+      die_codec(&codec, "Failed to set SVC");
+    for (i = 0; i < cfg.ts_number_layers; ++i) {
+      svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+      svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
     }
+    svc_params.scaling_factor_num[0] = cfg.g_h;
+    svc_params.scaling_factor_den[0] = cfg.g_h;
+    vpx_codec_control(&codec, VP9E_SET_SVC_PARAMETERS, &svc_params);
   }
-  vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+  if (strncmp(encoder->name, "vp8", 3) == 0) {
+    vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0);
+  }
   vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
   // This controls the maximum target size of the key frame.
   // For generating smaller key frames, use a smaller max_intra_size_pct
   // value, like 100 or 200.
-  max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
-      * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
-  // For low-quality key frame.
-  max_intra_size_pct = 200;
-  vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+  {
+    const int max_intra_size_pct = 900;
+    vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
 
   frame_avail = 1;
   while (frame_avail || got_data) {
     struct vpx_usec_timer timer;
     vpx_codec_iter_t iter = NULL;
     const vpx_codec_cx_pkt_t *pkt;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
     // Update the temporal layer_id. No spatial layers in this test.
     layer_id.spatial_layer_id = 0;
+#endif
     layer_id.temporal_layer_id =
         cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
     if (strncmp(encoder->name, "vp9", 3) == 0) {
       vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+    } else if (strncmp(encoder->name, "vp8", 3) == 0) {
+      vpx_codec_control(&codec, VP8E_SET_TEMPORAL_LAYER_ID,
+                        layer_id.temporal_layer_id);
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
+    if (layering_mode == 0)
+      flags = 0;
     frame_avail = vpx_img_read(&raw, infile);
     if (frame_avail)
       ++rc.layer_input_frames[layer_id.temporal_layer_id];
@@ -648,6 +772,33 @@
               ++rc.layer_enc_frames[i];
             }
           }
+          // Update for short-time encoding bitrate states, for moving window
+          // of size rc.window_size, shifted by rc.window_size / 2.
+          // Ignore first window segment, due to key frame.
+          if (frame_cnt > rc.window_size) {
+            sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate / rc.window_size) *
+                  (sum_bitrate / rc.window_size);
+              sum_bitrate = 0.0;
+            }
+          }
+          // Second shifted window.
+          if (frame_cnt > rc.window_size + rc.window_size / 2) {
+            sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt > 2 * rc.window_size &&
+                frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate2 / rc.window_size) *
+                  (sum_bitrate2 / rc.window_size);
+              sum_bitrate2 = 0.0;
+            }
+          }
           break;
           default:
             break;
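
In the hunk above, set_rate_control_metrics() now derives the per-frame bandwidth targets from the new rc->layer_target_bitrate array instead of cfg->ts_target_bitrate, but the arithmetic is unchanged: the base layer gets its cumulative target divided by its framerate, and each higher temporal layer gets only the bitrate and framerate it adds over the layer below. A small self-contained sketch with hypothetical numbers (30 fps input, rate decimators {2, 1}, cumulative targets {200, 400} kbps), for illustration only:

  #include <stdio.h>

  int main(void) {
    /* Hypothetical 2-layer setup, for illustration only. */
    const double framerate = 30.0;
    const int decimator[2] = { 2, 1 };
    const int target_kbps[2] = { 200, 400 };  /* cumulative per layer */
    double layer_fps[2];
    double layer_pfb[2];  /* per-frame bandwidth, in bits */
    int i;
    for (i = 0; i < 2; ++i)
      layer_fps[i] = framerate / decimator[i];
    layer_pfb[0] = 1000.0 * target_kbps[0] / layer_fps[0];
    layer_pfb[1] = 1000.0 * (target_kbps[1] - target_kbps[0]) /
                   (layer_fps[1] - layer_fps[0]);
    for (i = 0; i < 2; ++i)
      printf("layer %d: %.1f fps, %.0f bits/frame\n", i, layer_fps[i],
             layer_pfb[i]);
    return 0;
  }

With these numbers both layers come out to roughly 13333 bits per frame, which is what layer_pfb[] would hold.
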
diff --git a/libvpx/libs.doxy_template b/libvpx/libs.doxy_template
index 02e2902..5a8f847 100644
--- a/libvpx/libs.doxy_template
+++ b/libvpx/libs.doxy_template
@@ -36,7 +36,7 @@
 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded
 # by quotes) that should identify the project.
 
-PROJECT_NAME           = "WebM VP8 Codec SDK"
+PROJECT_NAME           = "WebM Codec SDK"
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -415,12 +415,6 @@
 
 SHOW_USED_FILES        = YES
 
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES       = NO
-
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
 # doxygen should invoke to get the current version for each file (typically from the
 # version control system). Doxygen will invoke the program by executing (via
@@ -715,12 +709,6 @@
 
 HTML_STYLESHEET        =
 
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
 # If the GENERATE_HTMLHELP tag is set to YES, additional index files
 # will be generated that can be used as input for tools like the
 # Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index 25fbc2c..b9d4b28 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -18,32 +18,6 @@
 endif
 
 #
-# Calculate platform- and compiler-specific offsets for hand coded assembly
-#
-ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
-define asm_offsets_template
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S
-	@echo "    [CREATE] $$@"
-	$$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@
-$$(BUILD_PFX)$(2).S: $(2)
-CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S
-endef
-else
-  ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
-define asm_offsets_template
-$$(BUILD_PFX)$(1): obj_int_extract
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o
-	@echo "    [CREATE] $$@"
-	$$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@
-OBJS-yes += $$(BUILD_PFX)$(2).o
-CLEAN-OBJS += $$(BUILD_PFX)$(1)
-$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1)
-endef
-endif # rvct
-endif # !gcc
-
-#
 # Rule to generate runtime cpu detection files
 #
 define rtcd_h_template
@@ -51,7 +25,7 @@
 	@echo "    [CREATE] $$@"
 	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \
           --sym=$(1) \
-          --config=$$(CONFIG_DIR)$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \
+          --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \
           $$(RTCD_OPTIONS) $$^ > $$@
 CLEAN-OBJS += $$(BUILD_PFX)$(1).h
 RTCD += $$(BUILD_PFX)$(1).h
@@ -60,13 +34,6 @@
 CODEC_SRCS-yes += CHANGELOG
 CODEC_SRCS-yes += libs.mk
 
-# If this is a universal (fat) binary, then all the subarchitectures have
-# already been built and our job is to stitch them together. The
-# BUILD_LIBVPX variable indicates whether we should be building
-# (compiling, linking) the library. The LIPO_LIBVPX variable indicates
-# that we're stitching.
-$(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
-
 include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
 CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
 CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))
@@ -80,6 +47,12 @@
 include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
 CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
 
+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
+CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
+
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
   VP8_PREFIX=vp8/
   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@@ -133,6 +106,42 @@
   CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
+
+#  VP10 make file
+ifneq ($(CONFIG_VP10_ENCODER)$(CONFIG_VP10_DECODER),)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
+endif
+
+ifeq ($(CONFIG_VP10_ENCODER),yes)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP10_PREFIX)vp10cx.mk vpx/vp8.h vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
+
+ifeq ($(CONFIG_VP10_DECODER),yes)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP10_PREFIX)vp10dx.mk vpx/vp8.h vpx/vp8dx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_decoder
+endif
+
+VP10_PREFIX=vp10/
+$(BUILD_PFX)$(VP10_PREFIX)%.c.o: CFLAGS += -Wextra
 
 ifeq ($(CONFIG_ENCODERS),yes)
   CODEC_DOC_SECTIONS += encoder
@@ -161,18 +170,18 @@
 INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
 endif
 
-CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.pl
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h
-CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
-CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
+CODEC_SRCS-yes += build/make/version.sh
+CODEC_SRCS-yes += build/make/rtcd.pl
+CODEC_SRCS-yes += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-yes += vpx_ports/mem_ops.h
+CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h
+CODEC_SRCS-yes += vpx_ports/vpx_once.h
+CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
-CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
+CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
 
@@ -203,33 +212,13 @@
 # based build systems.
 libvpx_srcs.txt:
 	@echo "    [CREATE] $@"
-	@echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@
+	@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_srcs.txt
 
 
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 
-obj_int_extract.bat: $(SRC_PATH_BARE)/build/$(MSVS_ARCH_DIR)/obj_int_extract.bat
-	@cp $^ $@
-
-obj_int_extract.$(VCPROJ_SFX): obj_int_extract.bat
-obj_int_extract.$(VCPROJ_SFX): $(SRC_PATH_BARE)/build/make/obj_int_extract.c
-	@echo "    [CREATE] $@"
-	$(qexec)$(GEN_VCPROJ) \
-    --exe \
-    --target=$(TOOLCHAIN) \
-    --name=obj_int_extract \
-    --ver=$(CONFIG_VS_VERSION) \
-    --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
-    --src-path-bare="$(SRC_PATH_BARE)" \
-    $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-    --out=$@ $^ \
-    -I. \
-    -I"$(SRC_PATH_BARE)" \
-
-PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.$(VCPROJ_SFX)
-
 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
 	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -244,7 +233,7 @@
     vpx_config.asm \
     vpx_ports/x86_abi_support.asm \
 
-vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def obj_int_extract.$(VCPROJ_SFX)
+vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             $(if $(CONFIG_SHARED),--dll,--lib) \
@@ -259,7 +248,7 @@
             $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
-PROJECTS-$(BUILD_LIBVPX) += vpx.$(VCPROJ_SFX)
+PROJECTS-yes += vpx.$(VCPROJ_SFX)
 
 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
@@ -267,32 +256,42 @@
 endif
 else
 LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
-OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
-LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+OBJS-yes += $(LIBVPX_OBJS)
+LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-
-BUILD_LIBVPX_SO         := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
-
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO               := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
 EXPORT_FILE             := libvpx.syms
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                              libvpx.dylib  )
 else
-LIBVPX_SO               := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
+SHARED_LIB_SUF          := _dll.a
+EXPORT_FILE             := libvpx.def
+LIBVPX_SO_SYMLINKS      :=
+LIBVPX_SO_IMPLIB        := libvpx_dll.a
+else
+LIBVPX_SO               := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
+SHARED_LIB_SUF          := .so
 EXPORT_FILE             := libvpx.ver
-SYM_LINK                := libvpx.so
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
-                             libvpx.so libvpx.so.$(VERSION_MAJOR) \
-                             libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+                             libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+                             libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
+endif
 endif
 
-LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
-                           $(notdir $(LIBVPX_SO_SYMLINKS))
+LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
+                           $(notdir $(LIBVPX_SO_SYMLINKS)) \
+                           $(if $(LIBVPX_SO_IMPLIB), $(BUILD_PFX)$(LIBVPX_SO_IMPLIB))
 $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
 $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
 $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
 
 libvpx.ver: $(call enabled,CODEC_EXPORTS)
@@ -307,6 +306,19 @@
 	$(qexec)awk '{print "_"$$2}' $^ >$@
 CLEAN-OBJS += libvpx.syms
 
+libvpx.def: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
+	$(qexec)echo "DATA MULTIPLE NONSHARED" >> $@
+	$(qexec)echo "EXPORTS" >> $@
+	$(qexec)awk '!/vpx_svc_*/ {print "_"$$2}' $^ >>$@
+CLEAN-OBJS += libvpx.def
+
+libvpx_dll.a: $(LIBVPX_SO)
+	@echo "    [IMPLIB] $@"
+	$(qexec)emximp -o $@ $<
+CLEAN-OBJS += libvpx_dll.a
+
 define libvpx_symlink_template
 $(1): $(2)
 	@echo "    [LN]     $(2) $$@"
@@ -322,11 +334,12 @@
     $(LIBVPX_SO)))
 
 
-INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBVPX_SO_SYMLINKS)
-INSTALL-LIBS-$(BUILD_LIBVPX_SO) += $(LIBSUBDIR)/$(LIBVPX_SO)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBVPX_SO_IMPLIB),$(LIBSUBDIR)/$(LIBVPX_SO_IMPLIB))
 
 
-LIBS-$(BUILD_LIBVPX) += vpx.pc
+LIBS-yes += vpx.pc
 vpx.pc: config.mk libs.mk
 	@echo "    [CREATE] $@"
 	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
@@ -352,9 +365,6 @@
 CLEAN-OBJS += vpx.pc
 endif
 
-LIBS-$(LIPO_LIBVPX) += libvpx.a
-$(eval $(if $(LIPO_LIBVPX),$(call lipo_lib_template,libvpx.a)))
-
 #
 # Rule to make assembler configuration file from C configuration file
 #
@@ -375,7 +385,7 @@
 endif
 
 #
-# Add assembler dependencies for configuration and offsets
+# Add assembler dependencies for configuration.
 #
 $(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
@@ -393,14 +403,18 @@
 
 include $(SRC_PATH_BARE)/test/test.mk
 LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
-LIBVPX_TEST_BINS=./test_libvpx$(EXE_SFX)
+LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
 LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                      $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
 
+TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
+TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
+TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
+
 libvpx_test_srcs.txt:
 	@echo "    [CREATE] $@"
-	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_test_srcs.txt
 
 $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
@@ -461,7 +475,25 @@
 
 PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX)
 
-LIBVPX_TEST_BINS := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BINS)))
+LIBVPX_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BIN)))
+
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX)
+test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --exe \
+            --target=$(TOOLCHAIN) \
+            --name=test_intra_pred_speed \
+            -D_VARIADIC_MAX=10 \
+            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
+endif  # TEST_INTRA_PRED_SPEED
 endif
 else
 
@@ -472,45 +504,54 @@
 # Disabling pthreads globally will cause issues on darwin and possibly elsewhere
 $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
 endif
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
-OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
-LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
+GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src
+GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(GTEST_OBJS)
+LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
 $(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS)
 
 LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS)))
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
-$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
-OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS)
-BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS)
+$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(LIBVPX_TEST_OBJS)
+BINS-yes += $(LIBVPX_TEST_BIN)
 
 CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
-CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
-$(foreach bin,$(LIBVPX_TEST_BINS),\
-    $(if $(BUILD_LIBVPX),$(eval $(bin): \
-        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
-    $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
-        $(LIBVPX_TEST_OBJS) \
-        -L. -lvpx -lgtest $(extralibs) -lm)\
-        )))\
-    $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
+TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a
+$(LIBVPX_TEST_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(LIBVPX_TEST_BIN), \
+              $(LIBVPX_TEST_OBJS) \
+              -L. -lvpx -lgtest $(extralibs) -lm))
 
-endif
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS)
+BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN)
+
+$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
+              $(TEST_INTRA_PRED_SPEED_OBJS) \
+              -L. -lvpx -lgtest $(extralibs) -lm))
+endif  # TEST_INTRA_PRED_SPEED
+
+endif  # CONFIG_UNIT_TESTS
 
 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
     $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
 
 define test_shard_template
 test:: test_shard.$(1)
-test_shard.$(1): $(LIBVPX_TEST_BINS) testdata
+test-no-data-check:: test_shard_ndc.$(1)
+test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
 	@set -e; \
-	 for t in $(LIBVPX_TEST_BINS); do \
-	   export GTEST_SHARD_INDEX=$(1); \
-	   export GTEST_TOTAL_SHARDS=$(2); \
-	   $$$$t; \
-	 done
+	 export GTEST_SHARD_INDEX=$(1); \
+	 export GTEST_TOTAL_SHARDS=$(2); \
+	 $(LIBVPX_TEST_BIN)
+test_shard.$(1): testdata
 .PHONY: test_shard.$(1)
 endef
 
@@ -529,12 +570,15 @@
 	@echo "    [CREATE] $@"
 	@rm -f $@
 	@echo "INPUT += $^" >> $@
-	@echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@
 	@echo "INCLUDE_PATH += ." >> $@;
 	@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
 
 ## Generate rtcd.h for all objects
+ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
 $(OBJS-yes:.o=.d): $(RTCD)
+else
+$(OBJS-yes): $(RTCD)
+endif
 
 ## Update the global src list
 SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS)
@@ -552,15 +596,16 @@
 # TODO(tomfinegan): Support running the debug versions of tools?
 TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
-utiltest: testdata
+utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(TEST_BIN_PATH)
+utiltest: testdata
 else
-utiltest:
+utiltest utiltest-no-data-check:
 	@echo Unit tests must be enabled to make the utiltest target.
 endif
 
@@ -578,11 +623,12 @@
 # TODO(tomfinegan): Support running the debug versions of tools?
 EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
-exampletest: examples testdata
+exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
 		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
 		--bin-path $(EXAMPLES_BIN_PATH)
+exampletest: testdata
 else
-exampletest:
+exampletest exampletest-no-data-check:
 	@echo Unit tests must be enabled to make the exampletest target.
 endif
diff --git a/libvpx/mainpage.dox b/libvpx/mainpage.dox
index e2ec280..ec202fa 100644
--- a/libvpx/mainpage.dox
+++ b/libvpx/mainpage.dox
@@ -1,4 +1,4 @@
-/*!\mainpage WebM VP8 Codec SDK
+/*!\mainpage WebM Codec SDK
 
   \section main_contents Page Contents
   - \ref main_intro
@@ -6,11 +6,11 @@
   - \ref main_support
 
   \section main_intro Introduction
-  Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your
-  applications with the VP8 video codec, a high quality, royalty free, open
-  source codec deployed on millions of computers and devices worldwide.
+  Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+  applications with the VP8 and VP9 video codecs, high quality, royalty free,
+  open source codecs deployed on billions of computers and devices worldwide.
 
-  This distribution of the WebM VP8 Codec SDK includes the following support:
+  This distribution of the WebM Codec SDK includes the following support:
 
   \if vp8_encoder
   - \ref vp8_encoder
@@ -28,12 +28,12 @@
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
   - \ref codec reference
-    \if encoder
-    - \ref encoder reference
-    \endif
-    \if decoder
-    - \ref decoder reference
-    \endif
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
 
   \section main_support Support Options & FAQ
   The WebM project is an open source project supported by its community. For
diff --git a/libvpx/md5_utils.c b/libvpx/md5_utils.c
index 8fb26e2..f4f893a 100644
--- a/libvpx/md5_utils.c
+++ b/libvpx/md5_utils.c
@@ -24,7 +24,7 @@
 
 #include "md5_utils.h"
 
-void
+static void
 byteSwap(UWORD32 *buf, unsigned words) {
   md5byte *p;
 
diff --git a/libvpx/rate_hist.c b/libvpx/rate_hist.c
index 1cef19b..a77222b 100644
--- a/libvpx/rate_hist.c
+++ b/libvpx/rate_hist.c
@@ -88,6 +88,9 @@
   if (now < cfg->rc_buf_initial_sz)
     return;
 
+  if (!cfg->rc_target_bitrate)
+    return;
+
   then = now;
 
   /* Sum the size over the past rc_buf_sz ms */
diff --git a/libvpx/solution.mk b/libvpx/solution.mk
index 2c8d29a..145adc0 100644
--- a/libvpx/solution.mk
+++ b/libvpx/solution.mk
@@ -9,7 +9,7 @@
 ##
 
 # libvpx reverse dependencies (targets that depend on libvpx)
-VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest obj_int_extract)
+VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest)
 VPX_RDEPS=$(foreach vcp,\
               $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):vpx)
 
@@ -17,7 +17,6 @@
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
             $(if $(filter vpx.$(VCPROJ_SFX),$^),$(VPX_RDEPS)) \
-            --dep=vpx:obj_int_extract \
             --dep=test_libvpx:gtest \
             --ver=$(CONFIG_VS_VERSION)\
             --out=$@ $^
diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h
index 496dae3..ff5c93e 100644
--- a/libvpx/test/acm_random.h
+++ b/libvpx/test/acm_random.h
@@ -29,14 +29,14 @@
   uint16_t Rand16(void) {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
-    return (value >> 16) & 0xffff;
+    return (value >> 15) & 0xffff;
   }
 
   uint8_t Rand8(void) {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     // There's a bit more entropy in the upper bits of this implementation.
-    return (value >> 24) & 0xff;
+    return (value >> 23) & 0xff;
   }
 
   uint8_t Rand8Extremes(void) {
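
The Rand16()/Rand8() changes above shift by one bit less. Assuming, as in the bundled googletest, that Random::Generate(kMaxRange) returns values in [0, 2^31), i.e. 31 random bits, shifting right by 16 (or 24) always leaves the top bit of the result zero, while shifting by 15 (or 23) keeps the full requested width random. That reading is an inference from the generator's range, not something stated in the patch; a quick check of the bit widths:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    /* Assumption: the generator yields at most 31 random bits. */
    const uint32_t max31 = (1u << 31) - 1;
    printf("(x >> 16) & 0xffff max: 0x%04x (15 bits)\n",
           (unsigned)((max31 >> 16) & 0xffff));
    printf("(x >> 15) & 0xffff max: 0x%04x (16 bits)\n",
           (unsigned)((max31 >> 15) & 0xffff));
    printf("(x >> 24) & 0xff   max: 0x%02x (7 bits)\n",
           (unsigned)((max31 >> 24) & 0xff));
    printf("(x >> 23) & 0xff   max: 0x%02x (8 bits)\n",
           (unsigned)((max31 >> 23) & 0xff));
    return 0;
  }
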
diff --git a/libvpx/test/active_map_test.cc b/libvpx/test/active_map_test.cc
index a9bb540..0221995 100644
--- a/libvpx/test/active_map_test.cc
+++ b/libvpx/test/active_map_test.cc
@@ -38,7 +38,7 @@
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
     } else if (video->frame() == 3) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       uint8_t active_map[9 * 13] = {
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
@@ -57,7 +57,7 @@
       map.active_map = active_map;
       encoder->Control(VP8E_SET_ACTIVEMAP, &map);
     } else if (video->frame() == 15) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       map.cols = (kWidth + 15) / 16;
       map.rows = (kHeight + 15) / 16;
       map.active_map = NULL;
diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk
index 4e750b2..48872a2 100644
--- a/libvpx/test/android/Android.mk
+++ b/libvpx/test/android/Android.mk
@@ -40,9 +40,17 @@
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libvpx_test
 LOCAL_STATIC_LIBRARIES := gtest libwebm
-LOCAL_SHARED_LIBRARIES := vpx
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := vpx
+else
+  LOCAL_STATIC_LIBRARIES += vpx
+endif
+
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
 LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+# Some test files depend on *_rtcd.h; ensure they're generated first.
+$(eval $(call rtcd_dep_template))
 include $(BUILD_EXECUTABLE)
diff --git a/libvpx/test/blockiness_test.cc b/libvpx/test/blockiness_test.cc
new file mode 100644
index 0000000..0c60baa
--- /dev/null
+++ b/libvpx/test/blockiness_test.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+
+extern "C"
+double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch,
+                          int width, int height);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class BlockinessTestBase : public ::testing::Test {
+ public:
+  BlockinessTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+    vpx_free(reference_data_);
+    reference_data_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant,
+                    int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    FillConstant(data, stride, fill_constant, width_, height_);
+  }
+
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+
+  void FillRandomBlocky(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        FillRandom(data + h * stride + w, stride, 4, 4);
+      }
+    }
+  }
+
+  void FillCheckerboard(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        if (((h/4) ^ (w/4)) & 1)
+          FillConstant(data + h * stride + w, stride, 255, 4, 4);
+        else
+          FillConstant(data + h * stride + w, stride, 0, 4, 4);
+      }
+    }
+  }
+
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> BlockinessParam;
+class BlockinessVP9Test
+    : public BlockinessTestBase,
+      public ::testing::WithParamInterface<BlockinessParam> {
+ public:
+  BlockinessVP9Test() : BlockinessTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  int CheckBlockiness() {
+    return vp9_get_blockiness(source_data_, source_stride_,
+                              reference_data_, reference_stride_,
+                              width_, height_);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* BlockinessTestBase::source_data_ = NULL;
+uint8_t* BlockinessTestBase::reference_data_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(BlockinessVP9Test, SourceBlockierThanReference) {
+  // Source is blockier than reference.
+  FillRandomBlocky(source_data_, source_stride_);
+  FillConstant(reference_data_, reference_stride_, 128);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_EQ(0, super_blocky) << "Blocky source should produce 0 blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, ReferenceBlockierThanSource) {
+  // Reference is blockier than source.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, 0.0)
+      << "Blocky reference should score high for blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, BlurringDecreasesBlockiness) {
+  // Reference is blockier than source.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, WorstCaseBlockiness) {
+  // Reference is a worst-case blocky checkerboard.
+  FillConstant(source_data_, source_stride_, 128);
+  FillCheckerboard(reference_data_, reference_stride_);
+
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const BlockinessParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
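
The new tests above only assert relative behaviour of vp9_get_blockiness(): a flat reference scores 0, a blocky or checkerboard reference scores high, and blurring lowers the score. The toy measure below is not the actual vp9_get_blockiness() implementation (which also takes the source image into account); it is a hypothetical boundary-discontinuity score that satisfies the same expectations, included only to make those expectations concrete:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* Toy score: absolute jumps across 4x4 block boundaries. NOT the real
   * vp9_get_blockiness(); it only shows why a checkerboard scores high
   * while flat or blurred data scores low. */
  static long toy_blockiness(const unsigned char *img, int stride,
                             int width, int height) {
    long score = 0;
    int r, c;
    for (r = 0; r < height; ++r)
      for (c = 4; c < width; c += 4)
        score += abs(img[r * stride + c] - img[r * stride + c - 1]);
    for (r = 4; r < height; r += 4)
      for (c = 0; c < width; ++c)
        score += abs(img[r * stride + c] - img[(r - 1) * stride + c]);
    return score;
  }

  int main(void) {
    enum { kW = 64, kH = 64 };
    static unsigned char flat[kW * kH], checker[kW * kH];
    int r, c;
    memset(flat, 128, sizeof(flat));
    for (r = 0; r < kH; ++r)
      for (c = 0; c < kW; ++c)
        checker[r * kW + c] = (((r / 4) ^ (c / 4)) & 1) ? 255 : 0;
    printf("flat: %ld  checkerboard: %ld\n",
           toy_blockiness(flat, kW, kW, kH),
           toy_blockiness(checker, kW, kW, kH));
    return 0;
  }
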
diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc
index b30be45..6592375 100644
--- a/libvpx/test/borders_test.cc
+++ b/libvpx/test/borders_test.cc
@@ -80,4 +80,7 @@
 
 VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
     ::libvpx_test::kTwoPassGood));
+
+VP10_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/libvpx/test/byte_alignment_test.cc b/libvpx/test/byte_alignment_test.cc
new file mode 100644
index 0000000..aa4b78b
--- /dev/null
+++ b/libvpx/test/byte_alignment_test.cc
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kLegacyByteAlignment = 0;
+const int kLegacyYPlaneByteAlignment = 32;
+const int kNumPlanesToCheck = 3;
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5";
+
+#if CONFIG_WEBM_IO
+
+struct ByteAlignmentTestParam {
+  int byte_alignment;
+  vpx_codec_err_t expected_value;
+  bool decode_remaining;
+};
+
+const ByteAlignmentTestParam kBaTestParams[] = {
+  {kLegacyByteAlignment, VPX_CODEC_OK, true},
+  {32, VPX_CODEC_OK, true},
+  {64, VPX_CODEC_OK, true},
+  {128, VPX_CODEC_OK, true},
+  {256, VPX_CODEC_OK, true},
+  {512, VPX_CODEC_OK, true},
+  {1024, VPX_CODEC_OK, true},
+  {1, VPX_CODEC_INVALID_PARAM, false},
+  {-2, VPX_CODEC_INVALID_PARAM, false},
+  {4, VPX_CODEC_INVALID_PARAM, false},
+  {16, VPX_CODEC_INVALID_PARAM, false},
+  {255, VPX_CODEC_INVALID_PARAM, false},
+  {2048, VPX_CODEC_INVALID_PARAM, false},
+};
+
+// Class for testing byte alignment of reference buffers.
+class ByteAlignmentTest
+    : public ::testing::TestWithParam<ByteAlignmentTestParam> {
+ protected:
+  ByteAlignmentTest()
+      : video_(NULL),
+        decoder_(NULL),
+        md5_file_(NULL) {}
+
+  virtual void SetUp() {
+    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+
+    OpenMd5File(kVP9Md5File);
+  }
+
+  virtual void TearDown() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+
+    delete decoder_;
+    delete video_;
+  }
+
+  void SetByteAlignment(int byte_alignment, vpx_codec_err_t expected_value) {
+    decoder_->Control(VP9_SET_BYTE_ALIGNMENT, byte_alignment, expected_value);
+  }
+
+  vpx_codec_err_t DecodeOneFrame(int byte_alignment_to_check) {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames(byte_alignment_to_check);
+    if (res == VPX_CODEC_OK)
+      video_->Next();
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      CheckDecodedFrames(byte_alignment_to_check);
+    }
+    return VPX_CODEC_OK;
+  }
+
+ private:
+  // Check if |data| is aligned to |byte_alignment_to_check|.
+  // |byte_alignment_to_check| must be a power of 2.
+  void CheckByteAlignment(const uint8_t *data, int byte_alignment_to_check) {
+    ASSERT_EQ(0u, reinterpret_cast<size_t>(data) % byte_alignment_to_check);
+  }
+
+  // Iterate through the planes of the decoded frames and check for
+  // alignment based off |byte_alignment_to_check|.
+  void CheckDecodedFrames(int byte_alignment_to_check) {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      if (byte_alignment_to_check == kLegacyByteAlignment) {
+        CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment);
+      } else {
+        for (int i = 0; i < kNumPlanesToCheck; ++i) {
+          CheckByteAlignment(img->planes[i], byte_alignment_to_check);
+        }
+      }
+      CheckMd5(*img);
+    }
+  }
+
+  // TODO(fgalligan): Move the MD5 testing code into another class.
+  void OpenMd5File(const std::string &md5_file_name) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
+    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
+        << md5_file_name;
+  }
+
+  void CheckMd5(const vpx_image_t &img) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5) << "MD5 checksums don't match";
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  FILE *md5_file_;
+};
+
+TEST_F(ByteAlignmentTest, SwitchByteAlignment) {
+  const int num_elements = 14;
+  const int byte_alignments[] = { 0, 32, 64, 128, 256, 512, 1024,
+                                  0, 1024, 32, 512, 64, 256, 128 };
+
+  for (int i = 0; i < num_elements; ++i) {
+    SetByteAlignment(byte_alignments[i], VPX_CODEC_OK);
+    ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame(byte_alignments[i]));
+  }
+  SetByteAlignment(byte_alignments[0], VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(byte_alignments[0]));
+}
+
+TEST_P(ByteAlignmentTest, TestAlignment) {
+  const ByteAlignmentTestParam t = GetParam();
+  SetByteAlignment(t.byte_alignment, t.expected_value);
+  if (t.decode_remaining)
+    ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment));
+}
+
+INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest,
+                        ::testing::ValuesIn(kBaTestParams));
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace
diff --git a/libvpx/test/codec_factory.h b/libvpx/test/codec_factory.h
index 7f9398c..09c9cf9 100644
--- a/libvpx/test/codec_factory.h
+++ b/libvpx/test/codec_factory.h
@@ -13,10 +13,10 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vpx_encoder.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -35,6 +35,11 @@
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const = 0;
 
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline)  // NOLINT(runtime/int)
+                                 const = 0;
+
   virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
                                  unsigned long deadline,
                                  const unsigned long init_flags,
@@ -72,6 +77,10 @@
   VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
       : Decoder(cfg, deadline) {}
 
+  VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP8_DECODER
@@ -104,8 +113,14 @@
 
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP8_DECODER
-    return new VP8Decoder(cfg, deadline);
+    return new VP8Decoder(cfg, flags, deadline);
 #else
     return NULL;
 #endif
@@ -154,6 +169,10 @@
   VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
       : Decoder(cfg, deadline) {}
 
+  VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP9_DECODER
@@ -186,8 +205,14 @@
 
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP9_DECODER
-    return new VP9Decoder(cfg, deadline);
+    return new VP9Decoder(cfg, flags, deadline);
 #else
     return NULL;
 #endif
@@ -208,6 +233,8 @@
                                                int usage) const {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
+#elif CONFIG_VP10_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
 #else
     return VPX_CODEC_INCAPABLE;
 #endif
@@ -226,7 +253,96 @@
 #define VP9_INSTANTIATE_TEST_CASE(test, ...)
 #endif  // CONFIG_VP9
 
+/*
+ * VP10 Codec Definitions
+ */
+#if CONFIG_VP10
+class VP10Decoder : public Decoder {
+ public:
+  VP10Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+  VP10Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+              unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP10_DECODER
+    return &vpx_codec_vp10_dx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP10Encoder : public Encoder {
+ public:
+  VP10Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+              const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP10_ENCODER
+    return &vpx_codec_vp10_cx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP10CodecFactory : public CodecFactory {
+ public:
+  VP10CodecFactory() : CodecFactory() {}
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
+#if CONFIG_VP10_DECODER
+    return new VP10Decoder(cfg, flags, deadline);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP10_ENCODER
+    return new VP10Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP10_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP10CodecFactory kVP10;
+
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)\
+  INSTANTIATE_TEST_CASE_P(VP10, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+               &libvpx_test::kVP10)), \
+          __VA_ARGS__))
+#else
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)
+#endif  // CONFIG_VP10
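As with the VP9 variant, the new macro is invoked from individual tests; the
borders_test.cc hunk earlier in this patch already uses it:

// Usage taken from test/borders_test.cc above.
VP10_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
    ::libvpx_test::kTwoPassGood));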
 
 }  // namespace libvpx_test
-
 #endif  // TEST_CODEC_FACTORY_H_
diff --git a/libvpx/test/consistency_test.cc b/libvpx/test/consistency_test.cc
new file mode 100644
index 0000000..9c2fd55
--- /dev/null
+++ b/libvpx/test/consistency_test.cc
@@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern "C"
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+                            uint8_t *img2, int img2_pitch,
+                            int width, int height,
+                            Ssimv *sv2, Metrics *m,
+                            int do_inconsistency);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class ConsistencyTestBase : public ::testing::Test {
+ public:
+  ConsistencyTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    source_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    ssim_array_ = new Ssimv[kDataBufferSize / 16];
+  }
+
+  static void ClearSsim() {
+    memset(ssim_array_, 0, kDataBufferSize / 16);
+  }
+  static void TearDownTestCase() {
+    vpx_free(source_data_[0]);
+    source_data_[0] = NULL;
+    vpx_free(reference_data_[0]);
+    reference_data_[0] = NULL;
+    vpx_free(source_data_[1]);
+    source_data_[1] = NULL;
+    vpx_free(reference_data_[1]);
+    reference_data_[1] = NULL;
+
+    delete[] ssim_array_;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+
+  void Copy(uint8_t *reference, uint8_t *source) {
+    memcpy(reference, source, kDataBufferSize);
+  }
+
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_[2];
+  int source_stride_;
+  static uint8_t* reference_data_[2];
+  int reference_stride_;
+  static Ssimv *ssim_array_;
+  Metrics metrics_;
+
+  ACMRandom rnd_;
+};
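Blur() above is a separable running-sum box filter. A naive horizontal-only
form of the same averaging is sketched below; it is illustrative only (not
part of the test) and assumes <vector> is available:

// Illustrative sketch: plain box average over `taps` pixels along one row,
// the straightforward form of what Blur() computes with a running sum.
static void NaiveBoxBlurRow(uint8_t *row, int width, int taps) {
  std::vector<uint8_t> out(row, row + width);
  for (int w = 0; w + taps <= width; ++w) {
    int sum = 0;
    for (int t = 0; t < taps; ++t) sum += row[w + t];
    out[w + taps / 2] = static_cast<uint8_t>((sum + taps / 2) / taps);
  }
  memcpy(row, &out[0], width);
}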
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> ConsistencyParam;
+class ConsistencyVP9Test
+    : public ConsistencyTestBase,
+      public ::testing::WithParamInterface<ConsistencyParam> {
+ public:
+  ConsistencyVP9Test() : ConsistencyTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  double CheckConsistency(int frame) {
+    EXPECT_LT(frame, 2) << "Frame to check has to be less than 2.";
+    return
+        vpx_get_ssim_metrics(source_data_[frame], source_stride_,
+                             reference_data_[frame], reference_stride_,
+                             width_, height_, ssim_array_, &metrics_, 1);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* ConsistencyTestBase::source_data_[2] = {NULL, NULL};
+uint8_t* ConsistencyTestBase::reference_data_[2] = {NULL, NULL};
+Ssimv* ConsistencyTestBase::ssim_array_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Blur(reference_data_[0], reference_stride_, 3);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 3);
+
+  double inconsistency = CheckConsistency(1);
+  inconsistency = CheckConsistency(0);
+  EXPECT_EQ(inconsistency, 0.0)
+      << "Should have 0 inconsistency if they are exactly the same.";
+
+  // If the source frames are themselves inconsistent, the measured
+  // reference-frame inconsistency should be lower than when the source is
+  // consistent.
+  FillRandom(source_data_[0], source_stride_);
+  FillRandom(source_data_[1], source_stride_);
+  FillRandom(reference_data_[0], reference_stride_);
+  FillRandom(reference_data_[1], reference_stride_);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+
+  Copy(source_data_[1], source_data_[0]);
+  CheckConsistency(0);
+  double inconsistency2 = CheckConsistency(1);
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Should have less inconsistency if source itself is inconsistent.";
+
+  // A lighter blur should be less inconsistent than a heavier blur applied
+  // to a frame with no blur.
+  ClearSsim();
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 4);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+  ClearSsim();
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 8);
+  CheckConsistency(0);
+  inconsistency2 = CheckConsistency(1);
+
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Stronger Blur should produce more inconsistency.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const ConsistencyParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
+                        ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc
index 5b4a20e..e0e929e 100644
--- a/libvpx/test/convolve_test.cc
+++ b/libvpx/test/convolve_test.cc
@@ -9,18 +9,27 @@
  */
 
 #include <string.h>
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
 namespace {
+
+static const unsigned int kMaxDimension = 64;
+
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
@@ -28,18 +37,34 @@
                              int w, int h);
 
 struct ConvolveFunctions {
-  ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg,
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg,
+                    ConvolveFunc h8, ConvolveFunc h8_avg,
                     ConvolveFunc v8, ConvolveFunc v8_avg,
-                    ConvolveFunc hv8, ConvolveFunc hv8_avg)
-      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
-        hv8_avg_(hv8_avg) {}
+                    ConvolveFunc hv8, ConvolveFunc hv8_avg,
+                    ConvolveFunc sh8, ConvolveFunc sh8_avg,
+                    ConvolveFunc sv8, ConvolveFunc sv8_avg,
+                    ConvolveFunc shv8, ConvolveFunc shv8_avg,
+                    int bd)
+      : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
+        v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8),
+        sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg),
+        use_highbd_(bd) {}
 
+  ConvolveFunc copy_;
+  ConvolveFunc avg_;
   ConvolveFunc h8_;
   ConvolveFunc v8_;
   ConvolveFunc hv8_;
   ConvolveFunc h8_avg_;
   ConvolveFunc v8_avg_;
   ConvolveFunc hv8_avg_;
+  ConvolveFunc sh8_;        // scaled horiz
+  ConvolveFunc sv8_;        // scaled vert
+  ConvolveFunc shv8_;       // scaled horiz/vert
+  ConvolveFunc sh8_avg_;    // scaled avg horiz
+  ConvolveFunc sv8_avg_;    // scaled avg vert
+  ConvolveFunc shv8_avg_;   // scaled avg horiz/vert
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
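A sketch of how the widened struct gets filled in. The real instantiations
appear near the end of this file; the listing below is only a plausible
example using the vpx_dsp C convolve entry points, with the plain 8-tap
kernels reused for the scaled slots purely for brevity (it is not a copy of
the actual table):

// Sketch only; slot order follows the constructor: copy, avg, h8, h8_avg,
// v8, v8_avg, hv8, hv8_avg, sh8, sh8_avg, sv8, sv8_avg, shv8, shv8_avg, bd.
const ConvolveFunctions convolve8_c_sketch(
    vpx_convolve_copy_c, vpx_convolve_avg_c,
    vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
    vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
    vpx_convolve8_c, vpx_convolve8_avg_c,
    vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,  // scaled horiz slots
    vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,    // scaled vert slots
    vpx_convolve8_c, vpx_convolve8_avg_c,              // scaled 2D slots
    0);  // bd = 0 selects the 8-bit path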
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
@@ -68,71 +93,66 @@
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
       (kInterp_Extend - 1) + output_height + kInterp_Extend;
+  unsigned int i, j;
 
-  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
-   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
-   *                                 + kInterp_Extend
-   *                               = 3 + 16 + 4
-   *                               = 23
-   * and filter_max_width = 16
-   */
-  uint8_t intermediate_buffer[71 * 64];
+  // Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+  // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+  //                                 + kInterp_Extend
+  //                               = 3 + 64 + 4
+  //                               = 71
+  // and filter_max_width          = kMaxDimension = 64.
+  //
+  uint8_t intermediate_buffer[71 * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
-  {
-    uint8_t *output_ptr = intermediate_buffer;
-    const int src_next_row_stride = src_stride - output_width;
-    unsigned int i, j;
-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < output_width; ++j) {
-        // Apply filter...
-        const int temp = (src_ptr[0] * HFilter[0]) +
-                         (src_ptr[1] * HFilter[1]) +
-                         (src_ptr[2] * HFilter[2]) +
-                         (src_ptr[3] * HFilter[3]) +
-                         (src_ptr[4] * HFilter[4]) +
-                         (src_ptr[5] * HFilter[5]) +
-                         (src_ptr[6] * HFilter[6]) +
-                         (src_ptr[7] * HFilter[7]) +
-                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+  uint8_t *output_ptr = intermediate_buffer;
+  const int src_next_row_stride = src_stride - output_width;
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  for (i = 0; i < intermediate_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * HFilter[0]) +
+          (src_ptr[1] * HFilter[1]) +
+          (src_ptr[2] * HFilter[2]) +
+          (src_ptr[3] * HFilter[3]) +
+          (src_ptr[4] * HFilter[4]) +
+          (src_ptr[5] * HFilter[5]) +
+          (src_ptr[6] * HFilter[6]) +
+          (src_ptr[7] * HFilter[7]) +
+          (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
-        // Normalize back to 0-255...
-        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        ++src_ptr;
-        output_ptr += intermediate_height;
-      }
-      src_ptr += src_next_row_stride;
-      output_ptr += intermediate_next_stride;
+      // Normalize back to 0-255...
+      *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
+      ++src_ptr;
+      output_ptr += intermediate_height;
     }
+    src_ptr += src_next_row_stride;
+    output_ptr += intermediate_next_stride;
   }
 
   // Vertical pass (transposed intermediate -> dst).
-  {
-    uint8_t *src_ptr = intermediate_buffer;
-    const int dst_next_row_stride = dst_stride - output_width;
-    unsigned int i, j;
-    for (i = 0; i < output_height; ++i) {
-      for (j = 0; j < output_width; ++j) {
-        // Apply filter...
-        const int temp = (src_ptr[0] * VFilter[0]) +
-                         (src_ptr[1] * VFilter[1]) +
-                         (src_ptr[2] * VFilter[2]) +
-                         (src_ptr[3] * VFilter[3]) +
-                         (src_ptr[4] * VFilter[4]) +
-                         (src_ptr[5] * VFilter[5]) +
-                         (src_ptr[6] * VFilter[6]) +
-                         (src_ptr[7] * VFilter[7]) +
-                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+  src_ptr = intermediate_buffer;
+  const int dst_next_row_stride = dst_stride - output_width;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * VFilter[0]) +
+          (src_ptr[1] * VFilter[1]) +
+          (src_ptr[2] * VFilter[2]) +
+          (src_ptr[3] * VFilter[3]) +
+          (src_ptr[4] * VFilter[4]) +
+          (src_ptr[5] * VFilter[5]) +
+          (src_ptr[6] * VFilter[6]) +
+          (src_ptr[7] * VFilter[7]) +
+          (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
-        // Normalize back to 0-255...
-        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        src_ptr += intermediate_height;
-      }
-      src_ptr += intermediate_next_stride;
-      dst_ptr += dst_next_row_stride;
+      // Normalize back to 0-255...
+      *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
+      src_ptr += intermediate_height;
     }
+    src_ptr += intermediate_next_stride;
+    dst_ptr += dst_next_row_stride;
   }
 }
 
@@ -159,16 +179,137 @@
                                 unsigned int dst_stride,
                                 unsigned int output_width,
                                 unsigned int output_height) {
-  uint8_t tmp[64 * 64];
+  uint8_t tmp[kMaxDimension * kMaxDimension];
 
-  assert(output_width <= 64);
-  assert(output_height <= 64);
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
   filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
                      output_width, output_height);
   block2d_average_c(tmp, 64, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
+                               uint16_t *dst_ptr,
+                               unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height,
+                               int bd) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+   *                                 + kInterp_Extend
+   *                               = 3 + 64 + 4
+   *                               = 71
+   * and filter_max_width = kMaxDimension = 64
+   */
+  uint16_t intermediate_buffer[71 * kMaxDimension];
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    uint16_t *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Clip back to the [0, (1 << bd) - 1] range...
+        *output_ptr = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
+        ++src_ptr;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    uint16_t *src_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Clip back to the [0, (1 << bd) - 1] range...
+        *dst_ptr++ = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
+        src_ptr += intermediate_height;
+      }
+      src_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
+void highbd_block2d_average_c(uint16_t *src,
+                              unsigned int src_stride,
+                              uint16_t *output_ptr,
+                              unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height,
+                              int bd) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
+                                       const unsigned int src_stride,
+                                       const int16_t *HFilter,
+                                       const int16_t *VFilter,
+                                       uint16_t *dst_ptr,
+                                       unsigned int dst_stride,
+                                       unsigned int output_width,
+                                       unsigned int output_height,
+                                       int bd) {
+  uint16_t tmp[kMaxDimension * kMaxDimension];
+
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+                            output_width, output_height, bd);
+  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+                           output_width, output_height, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  public:
   static void SetUpTestCase() {
@@ -177,13 +318,36 @@
         vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kOutputBufferSize));
+    output_ref_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
+#if CONFIG_VP9_HIGHBITDEPTH
+    input16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment,
+                     (kInputBufferSize + 1) * sizeof(uint16_t))) + 1;
+    output16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    output16_ref_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+#endif
   }
 
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
   static void TearDownTestCase() {
     vpx_free(input_ - 1);
     input_ = NULL;
     vpx_free(output_);
     output_ = NULL;
+    vpx_free(output_ref_);
+    output_ref_ = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(input16_ - 1);
+    input16_ = NULL;
+    vpx_free(output16_);
+    output16_ = NULL;
+    vpx_free(output16_ref_);
+    output16_ref_ = NULL;
+#endif
   }
 
  protected:
@@ -191,7 +355,6 @@
   static const int kOuterBlockSize = 256;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
-  static const int kMaxDimension = 64;
   static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
   static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
@@ -212,6 +375,12 @@
 
   virtual void SetUp() {
     UUT_ = GET_PARAM(2);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ != 0)
+      mask_ = (1 << UUT_->use_highbd_) - 1;
+    else
+      mask_ = 255;
+#endif
     /* Set up guard blocks for an inner block centered in the outer block */
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i))
@@ -222,15 +391,32 @@
 
     ::libvpx_test::ACMRandom prng;
     for (int i = 0; i < kInputBufferSize; ++i) {
-      if (i & 1)
+      if (i & 1) {
         input_[i] = 255;
-      else
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = mask_;
+#endif
+      } else {
         input_[i] = prng.Rand8Extremes();
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = prng.Rand16() & mask_;
+#endif
+      }
     }
   }
 
   void SetConstantInput(int value) {
     memset(input_, value, kInputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_memset16(input16_, value, kInputBufferSize);
+#endif
+  }
+
+  void CopyOutputToRef() {
+    memcpy(output_ref_, output_, kOutputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+    memcpy(output16_ref_, output16_, kOutputBufferSize);
+#endif
   }
 
   void CheckGuardBlocks() {
@@ -240,39 +426,197 @@
     }
   }
 
-  uint8_t* input() const {
+  uint8_t *input() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
     return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
   }
 
-  uint8_t* output() const {
+  uint8_t *output() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
     return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+  }
+
+  uint8_t *output_ref() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
+    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+  }
+
+  uint16_t lookup(uint8_t *list, int index) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CONVERT_TO_SHORTPTR(list)[index];
+    }
+#else
+    return list[index];
+#endif
+  }
+
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t) val;
+    } else {
+      CONVERT_TO_SHORTPTR(list)[index] = val;
+    }
+#else
+    list[index] = (uint8_t) val;
+#endif
+  }
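lookup() and assign_val() hide the pointer convention this tree uses for high
bit depth: the samples live in uint16_t buffers, but the convolve functions
are still called through uint8_t* handles produced by CONVERT_TO_BYTEPTR and
unwrapped with CONVERT_TO_SHORTPTR (vpx_ports/mem.h). A hedged sketch of
reading one pixel either way (ReadPixel is a hypothetical stand-alone helper,
not part of this test):

// Sketch: fetch sample `i` from a buffer that may be 8-bit or high bitdepth.
// `is_highbd` stands in for UUT_->use_highbd_ != 0.
static uint16_t ReadPixel(const uint8_t *buf, int i, bool is_highbd) {
  if (!is_highbd)
    return buf[i];                     // ordinary 8-bit sample
  return CONVERT_TO_SHORTPTR(buf)[i];  // recover the uint16_t view
}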
+
+  void wrapper_filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                          const unsigned int src_stride,
+                                          const int16_t *HFilter,
+                                          const int16_t *VFilter,
+                                          uint8_t *dst_ptr,
+                                          unsigned int dst_stride,
+                                          unsigned int output_width,
+                                          unsigned int output_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                                 dst_ptr, dst_stride, output_width,
+                                 output_height);
+    } else {
+      highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr),
+                                        src_stride, HFilter, VFilter,
+                                        CONVERT_TO_SHORTPTR(dst_ptr),
+                                        dst_stride, output_width, output_height,
+                                        UUT_->use_highbd_);
+    }
+#else
+    filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                               dst_ptr, dst_stride, output_width,
+                               output_height);
+#endif
+  }
+
+  void wrapper_filter_block2d_8_c(const uint8_t *src_ptr,
+                                  const unsigned int src_stride,
+                                  const int16_t *HFilter,
+                                  const int16_t *VFilter,
+                                  uint8_t *dst_ptr,
+                                  unsigned int dst_stride,
+                                  unsigned int output_width,
+                                  unsigned int output_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                         dst_ptr, dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter,
+                                CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                output_width, output_height, UUT_->use_highbd_);
+    }
+#else
+    filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                       dst_ptr, dst_stride, output_width, output_height);
+#endif
   }
 
   const ConvolveFunctions* UUT_;
   static uint8_t* input_;
   static uint8_t* output_;
+  static uint8_t* output_ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+  static uint16_t* input16_;
+  static uint16_t* output16_;
+  static uint16_t* output16_ref_;
+  int mask_;
+#endif
 };
+
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
+uint8_t* ConvolveTest::output_ref_ = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+uint16_t* ConvolveTest::input16_ = NULL;
+uint16_t* ConvolveTest::output16_ = NULL;
+uint16_t* ConvolveTest::output16_ref_ = NULL;
+#endif
 
 TEST_P(ConvolveTest, GuardBlocks) {
   CheckGuardBlocks();
 }
 
+TEST_P(ConvolveTest, Copy) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                  Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, Avg) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  uint8_t* const out_ref = output_ref();
+  CopyOutputToRef();
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->avg_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) +
+                                   lookup(out_ref, y * kOutputStride + x), 1))
+          << "(" << x << "," << y << ")";
+}
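The expected value in the loop above is just the rounded mean of the input
sample and the pre-existing output sample, since ROUND_POWER_OF_TWO(v, 1)
expands to (v + 1) >> 1. A quick worked instance:

// e.g. in = 200, out_ref = 51:
//   ROUND_POWER_OF_TWO(200 + 51, 1) = (251 + 1) >> 1 = 126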
+
 TEST_P(ConvolveTest, CopyHoriz) {
   uint8_t* const in = input();
   uint8_t* const out = output();
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
   ASM_REGISTER_STATE_CHECK(
-      UUT_->h8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
-                Width(), Height()));
+      UUT_->sh8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                 Width(), Height()));
 
   CheckGuardBlocks();
 
   for (int y = 0; y < Height(); ++y)
     for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
           << "(" << x << "," << y << ")";
 }
 
@@ -282,14 +626,15 @@
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
   ASM_REGISTER_STATE_CHECK(
-      UUT_->v8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
-                Width(), Height()));
+      UUT_->sv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                 Width(), Height()));
 
   CheckGuardBlocks();
 
   for (int y = 0; y < Height(); ++y)
     for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
           << "(" << x << "," << y << ")";
 }
 
@@ -299,14 +644,15 @@
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
   ASM_REGISTER_STATE_CHECK(
-      UUT_->hv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
-                 Width(), Height()));
+      UUT_->shv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8,
+                  16, Width(), Height()));
 
   CheckGuardBlocks();
 
   for (int y = 0; y < Height(); ++y)
     for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
           << "(" << x << "," << y << ")";
 }
 
@@ -316,7 +662,7 @@
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
     for (int i = 0; i < kNumFilters; i++) {
       const int p0 = filters[i][0] + filters[i][1];
       const int p1 = filters[i][2] + filters[i][3];
@@ -339,23 +685,31 @@
 TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
   uint8_t* const in = input();
   uint8_t* const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t* ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
+  }
+#else
   uint8_t ref[kOutputStride * kMaxDimension];
-
+#endif
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
-    const InterpKernel *const eighttap_smooth =
-        vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
-        filter_block2d_8_c(in, kInputStride,
-                           filters[filter_x], filters[filter_y],
-                           ref, kOutputStride,
-                           Width(), Height());
+        wrapper_filter_block2d_8_c(in, kInputStride,
+                                   filters[filter_x], filters[filter_y],
+                                   ref, kOutputStride,
+                                   Width(), Height());
 
-        if (filters == eighttap_smooth || (filter_x && filter_y))
+        if (filter_x && filter_y)
           ASM_REGISTER_STATE_CHECK(
               UUT_->hv8_(in, kInputStride, out, kOutputStride,
                          filters[filter_x], 16, filters[filter_y], 16,
@@ -365,17 +719,23 @@
               UUT_->v8_(in, kInputStride, out, kOutputStride,
                         kInvalidFilter, 16, filters[filter_y], 16,
                         Width(), Height()));
-        else
+        else if (filter_x)
           ASM_REGISTER_STATE_CHECK(
               UUT_->h8_(in, kInputStride, out, kOutputStride,
                         filters[filter_x], 16, kInvalidFilter, 16,
                         Width(), Height()));
+        else
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->copy_(in, kInputStride, out, kOutputStride,
+                          kInvalidFilter, 0, kInvalidFilter, 0,
+                          Width(), Height()));
 
         CheckGuardBlocks();
 
         for (int y = 0; y < Height(); ++y)
           for (int x = 0; x < Width(); ++x)
-            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
+            ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                      lookup(out, y * kOutputStride + x))
                 << "mismatch at (" << x << "," << y << "), "
                 << "filters (" << filter_bank << ","
                 << filter_x << "," << filter_y << ")";
@@ -387,33 +747,51 @@
 TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {
   uint8_t* const in = input();
   uint8_t* const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t* ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
+  }
+#else
   uint8_t ref[kOutputStride * kMaxDimension];
+#endif
 
   // Populate ref and out with some random data
   ::libvpx_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
     for (int x = 0; x < Width(); ++x) {
-      const uint8_t r = prng.Rand8Extremes();
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
 
-      out[y * kOutputStride + x] = r;
-      ref[y * kOutputStride + x] = r;
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
     }
   }
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
-    const InterpKernel *const eighttap_smooth =
-        vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
-        filter_average_block2d_8_c(in, kInputStride,
-                                   filters[filter_x], filters[filter_y],
-                                   ref, kOutputStride,
-                                   Width(), Height());
+        wrapper_filter_average_block2d_8_c(in, kInputStride,
+                                           filters[filter_x], filters[filter_y],
+                                           ref, kOutputStride,
+                                           Width(), Height());
 
-        if (filters == eighttap_smooth || (filter_x && filter_y))
+        if (filter_x && filter_y)
           ASM_REGISTER_STATE_CHECK(
               UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,
                              filters[filter_x], 16, filters[filter_y], 16,
@@ -421,19 +799,25 @@
         else if (filter_y)
           ASM_REGISTER_STATE_CHECK(
               UUT_->v8_avg_(in, kInputStride, out, kOutputStride,
-                            filters[filter_x], 16, filters[filter_y], 16,
+                            kInvalidFilter, 16, filters[filter_y], 16,
+                            Width(), Height()));
+        else if (filter_x)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->h8_avg_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, kInvalidFilter, 16,
                             Width(), Height()));
         else
           ASM_REGISTER_STATE_CHECK(
-              UUT_->h8_avg_(in, kInputStride, out, kOutputStride,
-                            filters[filter_x], 16, filters[filter_y], 16,
-                            Width(), Height()));
+              UUT_->avg_(in, kInputStride, out, kOutputStride,
+                          kInvalidFilter, 0, kInvalidFilter, 0,
+                          Width(), Height()));
 
         CheckGuardBlocks();
 
         for (int y = 0; y < Height(); ++y)
           for (int x = 0; x < Width(); ++x)
-            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
+            ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                      lookup(out, y * kOutputStride + x))
                 << "mismatch at (" << x << "," << y << "), "
                 << "filters (" << filter_bank << ","
                 << filter_x << "," << filter_y << ")";
@@ -442,109 +826,102 @@
   }
 }
 
-DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
-    { 0,   0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0, 128},
-    { 0,   0,   0, 128},
-    { 0,   0, 128},
-    { 0, 128},
-    { 128},
-    { 0,   0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0, 128},
-    { 0,   0,   0, 128},
-    { 0,   0, 128},
-    { 0, 128},
-    { 128}
-};
-
-/* This test exercises the horizontal and vertical filter functions. */
-TEST_P(ConvolveTest, ChangeFilterWorks) {
-  uint8_t* const in = input();
-  uint8_t* const out = output();
-
-  /* Assume that the first input sample is at the 8/16th position. */
-  const int kInitialSubPelOffset = 8;
-
-  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
-   * at position -3 with respect to the current filtering position. Since
-   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
-   * which is non-zero only in the last tap. So, applying the filter at the
-   * current input position will result in an output equal to the pixel at
-   * offset +4 (-3 + 7) with respect to the current filtering position.
-   */
-  const int kPixelSelected = 4;
-
-  /* Assume that each output pixel requires us to step on by 17/16th pixels in
-   * the input.
-   */
-  const int kInputPixelStep = 17;
-
-  /* The filters are setup in such a way that the expected output produces
-   * sets of 8 identical output samples. As the filter position moves to the
-   * next 1/16th pixel position the only active (=128) filter tap moves one
-   * position to the left, resulting in the same input pixel being replicated
-   * in to the output for 8 consecutive samples. After each set of 8 positions
-   * the filters select a different input pixel. kFilterPeriodAdjust below
-   * computes which input pixel is written to the output for a specified
-   * x or y position.
-   */
-
-  /* Test the horizontal filter. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->h8_(in, kInputStride, out, kOutputStride,
-                kChangeFilters[kInitialSubPelOffset],
-                kInputPixelStep, NULL, 0, Width(), Height()));
-
-  for (int x = 0; x < Width(); ++x) {
-    const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
+TEST_P(ConvolveTest, FilterExtremes) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
   }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
 
-  /* Test the vertical filter. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->v8_(in, kInputStride, out, kOutputStride,
-                NULL, 0, kChangeFilters[kInitialSubPelOffset],
-                kInputPixelStep, Width(), Height()));
-
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
-    const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
-  }
-
-  /* Test the horizontal and vertical filters in combination. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
-                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
-                 Width(), Height()));
-
-  for (int y = 0; y < Height(); ++y) {
-    const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjustY * kInputPixelStep)
-                          >> SUBPEL_BITS);
     for (int x = 0; x < Width(); ++x) {
-      const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x =
-          kPixelSelected + ((kInitialSubPelOffset
-              + kFilterPeriodAdjustX * kInputPixelStep)
-                            >> SUBPEL_BITS);
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
 
-      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
-          << "x == " << x << ", y == " << y;
+  for (int axis = 0; axis < 2; axis++) {
+    int seed_val = 0;
+    while (seed_val < 256) {
+      for (int y = 0; y < 8; ++y) {
+        for (int x = 0; x < 8; ++x) {
+#if CONFIG_VP9_HIGHBITDEPTH
+          assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * mask_);
+#else
+          assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * 255);
+#endif
+          if (axis) seed_val++;
+        }
+        if (axis)
+          seed_val -= 8;
+        else
+          seed_val++;
+      }
+      if (axis) seed_val += 8;
+
+      for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+        const InterpKernel *filters =
+            vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
+        for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+          for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+            wrapper_filter_block2d_8_c(in, kInputStride,
+                                       filters[filter_x], filters[filter_y],
+                                       ref, kOutputStride,
+                                       Width(), Height());
+            if (filter_x && filter_y)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                             filters[filter_x], 16, filters[filter_y], 16,
+                             Width(), Height()));
+            else if (filter_y)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->v8_(in, kInputStride, out, kOutputStride,
+                            kInvalidFilter, 16, filters[filter_y], 16,
+                            Width(), Height()));
+            else if (filter_x)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->h8_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, kInvalidFilter, 16,
+                            Width(), Height()));
+            else
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->copy_(in, kInputStride, out, kOutputStride,
+                              kInvalidFilter, 0, kInvalidFilter, 0,
+                              Width(), Height()));
+
+            for (int y = 0; y < Height(); ++y)
+              for (int x = 0; x < Width(); ++x)
+                ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                          lookup(out, y * kOutputStride + x))
+                    << "mismatch at (" << x << "," << y << "), "
+                    << "filters (" << filter_bank << ","
+                    << filter_x << "," << filter_y << ")";
+          }
+        }
+      }
     }
   }
 }
@@ -554,23 +931,24 @@
 TEST_P(ConvolveTest, CheckScalingFiltering) {
   uint8_t* const in = input();
   uint8_t* const out = output();
-  const InterpKernel *const eighttap = vp9_get_interp_kernel(EIGHTTAP);
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
 
   SetConstantInput(127);
 
   for (int frac = 0; frac < 16; ++frac) {
     for (int step = 1; step <= 32; ++step) {
       /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                          eighttap[frac], step,
-                                          eighttap[frac], step,
-                                          Width(), Height()));
+      ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
+                                           eighttap[frac], step,
+                                           eighttap[frac], step,
+                                           Width(), Height()));
 
       CheckGuardBlocks();
 
       for (int y = 0; y < Height(); ++y) {
         for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
+          ASSERT_EQ(lookup(in, y * kInputStride + x),
+                    lookup(out, y * kOutputStride + x))
               << "x == " << x << ", y == " << y
               << ", frac == " << frac << ", step == " << step;
         }
@@ -581,10 +959,590 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE2 && ARCH_X86_64
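+// Each wrap_convolve*_sse2_{8,10,12} helper below binds a bit depth (8, 10,
+// or 12) to the corresponding vpx_highbd_convolve*_sse2 function so that it
+// matches the signature the ConvolveFunctions table expects.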
+void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,
+                                 int filter_x_stride,
+                                 const int16_t *filter_y,
+                                 int filter_y_stride,
+                                 int w, int h) {
+  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_x_stride, filter_y, filter_y_stride,
+                                  w, h, 8);
+}
+
+void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x,
+                                     int filter_x_stride,
+                                     const int16_t *filter_y,
+                                     int filter_y_stride,
+                                     int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x,
+                                int filter_x_stride,
+                                const int16_t *filter_y,
+                                int filter_y_stride,
+                                int w, int h) {
+  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x,
+                                    int filter_x_stride,
+                                    const int16_t *filter_y,
+                                    int filter_y_stride,
+                                    int w, int h) {
+  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x,
+                           int filter_x_stride,
+                           const int16_t *filter_y,
+                           int filter_y_stride,
+                           int w, int h) {
+  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x,
+                               int filter_x_stride,
+                               const int16_t *filter_y,
+                               int filter_y_stride,
+                               int w, int h) {
+  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride,
+                                  int w, int h) {
+  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x,
+                                      int filter_x_stride,
+                                      const int16_t *filter_y,
+                                      int filter_y_stride,
+                                      int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,
+                                 int filter_x_stride,
+                                 const int16_t *filter_y,
+                                 int filter_y_stride,
+                                 int w, int h) {
+  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x,
+                                     int filter_x_stride,
+                                     const int16_t *filter_y,
+                                     int filter_y_stride,
+                                     int w, int h) {
+  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x,
+                                int filter_x_stride,
+                                const int16_t *filter_y,
+                                int filter_y_stride,
+                                int w, int h) {
+  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride,
+                                  int w, int h) {
+  vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x,
+                                      int filter_x_stride,
+                                      const int16_t *filter_y,
+                                      int filter_y_stride,
+                                      int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_x_stride,
+                                      filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,
+                                 int filter_x_stride,
+                                 const int16_t *filter_y,
+                                 int filter_y_stride,
+                                 int w, int h) {
+  vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+                                 filter_x, filter_x_stride,
+                                 filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x,
+                                     int filter_x_stride,
+                                     const int16_t *filter_y,
+                                     int filter_y_stride,
+                                     int w, int h) {
+  vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+                                     filter_x, filter_x_stride,
+                                     filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x,
+                                int filter_x_stride,
+                                const int16_t *filter_y,
+                                int filter_y_stride,
+                                int w, int h) {
+  vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+                                filter_x, filter_x_stride,
+                                filter_y, filter_y_stride, w, h, 12);
+}
+#endif  // HAVE_SSE2 && ARCH_X86_64
+
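+// The wrap_convolve*_c_{8,10,12} helpers below do the same for the C
+// reference implementations, which are available on every target.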
+void wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x,
+                           int filter_x_stride,
+                           const int16_t *filter_y,
+                           int filter_y_stride,
+                           int w, int h) {
+  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x,
+                              int filter_x_stride,
+                              const int16_t *filter_y,
+                              int filter_y_stride,
+                              int w, int h) {
+  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride,
+                                  int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,
+                             int filter_x_stride,
+                             const int16_t *filter_y,
+                             int filter_y_stride,
+                             int w, int h) {
+  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,
+                                 int filter_x_stride,
+                                 const int16_t *filter_y,
+                                 int filter_y_stride,
+                                 int w, int h) {
+  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x,
+                        int filter_x_stride,
+                        const int16_t *filter_y,
+                        int filter_y_stride,
+                        int w, int h) {
+  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 8);
+}
+
+void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,
+                             int filter_x_stride,
+                             const int16_t *filter_y,
+                             int filter_y_stride,
+                             int w, int h) {
+  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x,
+                               int filter_x_stride,
+                               const int16_t *filter_y,
+                               int filter_y_stride,
+                               int w, int h) {
+  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x,
+                                   int filter_x_stride,
+                                   const int16_t *filter_y,
+                                   int filter_y_stride,
+                                   int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x,
+                              int filter_x_stride,
+                              const int16_t *filter_y,
+                              int filter_y_stride,
+                              int w, int h) {
+  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride,
+                                  int w, int h) {
+  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x,
+                         int filter_x_stride,
+                         const int16_t *filter_y,
+                         int filter_y_stride,
+                         int w, int h) {
+  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,
+                             int filter_x_stride,
+                             const int16_t *filter_y,
+                             int filter_y_stride,
+                             int w, int h) {
+  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 10);
+}
+
+void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,
+                             int filter_x_stride,
+                             const int16_t *filter_y,
+                             int filter_y_stride,
+                             int w, int h) {
+  vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x,
+                            int filter_x_stride,
+                            const int16_t *filter_y,
+                            int filter_y_stride,
+                            int w, int h) {
+  vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+                            filter_x, filter_x_stride,
+                            filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x,
+                               int filter_x_stride,
+                               const int16_t *filter_y,
+                               int filter_y_stride,
+                               int w, int h) {
+  vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                               filter_x, filter_x_stride,
+                               filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x,
+                                   int filter_x_stride,
+                                   const int16_t *filter_y,
+                                   int filter_y_stride,
+                                   int w, int h) {
+  vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_x_stride,
+                                   filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x,
+                              int filter_x_stride,
+                              const int16_t *filter_y,
+                              int filter_y_stride,
+                              int w, int h) {
+  vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                              filter_x, filter_x_stride,
+                              filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride,
+                                  int w, int h) {
+  vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                  filter_x, filter_x_stride,
+                                  filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x,
+                         int filter_x_stride,
+                         const int16_t *filter_y,
+                         int filter_y_stride,
+                         int w, int h) {
+  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+                         filter_x, filter_x_stride,
+                         filter_y, filter_y_stride, w, h, 12);
+}
+
+void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,
+                             int filter_x_stride,
+                             const int16_t *filter_y,
+                             int filter_y_stride,
+                             int w, int h) {
+  vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                             filter_x, filter_x_stride,
+                             filter_y, filter_y_stride, w, h, 12);
+}
+
 const ConvolveFunctions convolve8_c(
-    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
-    vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
-    vp9_convolve8_c, vp9_convolve8_avg_c);
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
+    wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
+    wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_c),
+    make_tuple(8, 4, &convolve8_c),
+    make_tuple(4, 8, &convolve8_c),
+    make_tuple(8, 8, &convolve8_c),
+    make_tuple(16, 8, &convolve8_c),
+    make_tuple(8, 16, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c),
+    make_tuple(32, 16, &convolve8_c),
+    make_tuple(16, 32, &convolve8_c),
+    make_tuple(32, 32, &convolve8_c),
+    make_tuple(64, 32, &convolve8_c),
+    make_tuple(32, 64, &convolve8_c),
+    make_tuple(64, 64, &convolve8_c)));
+const ConvolveFunctions convolve10_c(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
+    wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
+    wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
+INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve10_c),
+    make_tuple(8, 4, &convolve10_c),
+    make_tuple(4, 8, &convolve10_c),
+    make_tuple(8, 8, &convolve10_c),
+    make_tuple(16, 8, &convolve10_c),
+    make_tuple(8, 16, &convolve10_c),
+    make_tuple(16, 16, &convolve10_c),
+    make_tuple(32, 16, &convolve10_c),
+    make_tuple(16, 32, &convolve10_c),
+    make_tuple(32, 32, &convolve10_c),
+    make_tuple(64, 32, &convolve10_c),
+    make_tuple(32, 64, &convolve10_c),
+    make_tuple(64, 64, &convolve10_c)));
+const ConvolveFunctions convolve12_c(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
+    wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
+    wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
+INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve12_c),
+    make_tuple(8, 4, &convolve12_c),
+    make_tuple(4, 8, &convolve12_c),
+    make_tuple(8, 8, &convolve12_c),
+    make_tuple(16, 8, &convolve12_c),
+    make_tuple(8, 16, &convolve12_c),
+    make_tuple(16, 16, &convolve12_c),
+    make_tuple(32, 16, &convolve12_c),
+    make_tuple(16, 32, &convolve12_c),
+    make_tuple(32, 32, &convolve12_c),
+    make_tuple(64, 32, &convolve12_c),
+    make_tuple(32, 64, &convolve12_c),
+    make_tuple(64, 64, &convolve12_c)));
+
+#else
+
+const ConvolveFunctions convolve8_c(
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+    vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
+    vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
+    vpx_convolve8_c, vpx_convolve8_avg_c,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_c),
@@ -600,12 +1558,87 @@
     make_tuple(64, 32, &convolve8_c),
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
-    vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
-    vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
-    vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
+    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
+    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
+    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
+    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
+    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
+const ConvolveFunctions convolve10_sse2(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
+    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
+    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
+    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
+    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
+    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
+const ConvolveFunctions convolve12_sse2(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
+    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
+    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
+    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
+    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
+    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_sse2),
+    make_tuple(8, 4, &convolve8_sse2),
+    make_tuple(4, 8, &convolve8_sse2),
+    make_tuple(8, 8, &convolve8_sse2),
+    make_tuple(16, 8, &convolve8_sse2),
+    make_tuple(8, 16, &convolve8_sse2),
+    make_tuple(16, 16, &convolve8_sse2),
+    make_tuple(32, 16, &convolve8_sse2),
+    make_tuple(16, 32, &convolve8_sse2),
+    make_tuple(32, 32, &convolve8_sse2),
+    make_tuple(64, 32, &convolve8_sse2),
+    make_tuple(32, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2),
+    make_tuple(4, 4, &convolve10_sse2),
+    make_tuple(8, 4, &convolve10_sse2),
+    make_tuple(4, 8, &convolve10_sse2),
+    make_tuple(8, 8, &convolve10_sse2),
+    make_tuple(16, 8, &convolve10_sse2),
+    make_tuple(8, 16, &convolve10_sse2),
+    make_tuple(16, 16, &convolve10_sse2),
+    make_tuple(32, 16, &convolve10_sse2),
+    make_tuple(16, 32, &convolve10_sse2),
+    make_tuple(32, 32, &convolve10_sse2),
+    make_tuple(64, 32, &convolve10_sse2),
+    make_tuple(32, 64, &convolve10_sse2),
+    make_tuple(64, 64, &convolve10_sse2),
+    make_tuple(4, 4, &convolve12_sse2),
+    make_tuple(8, 4, &convolve12_sse2),
+    make_tuple(4, 8, &convolve12_sse2),
+    make_tuple(8, 8, &convolve12_sse2),
+    make_tuple(16, 8, &convolve12_sse2),
+    make_tuple(8, 16, &convolve12_sse2),
+    make_tuple(16, 16, &convolve12_sse2),
+    make_tuple(32, 16, &convolve12_sse2),
+    make_tuple(16, 32, &convolve12_sse2),
+    make_tuple(32, 32, &convolve12_sse2),
+    make_tuple(64, 32, &convolve12_sse2),
+    make_tuple(32, 64, &convolve12_sse2),
+    make_tuple(64, 64, &convolve12_sse2)));
+#else
+const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+    vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
+#else
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+#endif  // CONFIG_USE_X86INC
+    vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
+    vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
+    vpx_convolve8_sse2, vpx_convolve8_avg_sse2,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_sse2),
@@ -621,13 +1654,18 @@
     make_tuple(64, 32, &convolve8_sse2),
     make_tuple(32, 64, &convolve8_sse2),
     make_tuple(64, 64, &convolve8_sse2)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
-    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
-    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
-    vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3);
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+    vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3,
+    vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3,
+    vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_ssse3),
@@ -645,40 +1683,22 @@
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
 
-#if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4,
-                             int w, int h);
-void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h);
-}
-
+#if HAVE_AVX2 && HAVE_SSSE3
 const ConvolveFunctions convolve8_avx2(
-    vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
-    vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
-    vp9_convolve8_avx2, vp9_convolve8_avg_ssse3);
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+    vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3,
+    vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3,
+    vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_avx2),
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
     make_tuple(8, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2)));
-
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
+    make_tuple(8, 16, &convolve8_avx2),
     make_tuple(16, 8, &convolve8_avx2),
     make_tuple(16, 16, &convolve8_avx2),
     make_tuple(32, 16, &convolve8_avx2),
@@ -687,13 +1707,28 @@
     make_tuple(64, 32, &convolve8_avx2),
     make_tuple(32, 64, &convolve8_avx2),
     make_tuple(64, 64, &convolve8_avx2)));
-#endif
+#endif  // HAVE_AVX2 && HAVE_SSSE3
 
+#if HAVE_NEON
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
-    vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
-    vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
-    vp9_convolve8_neon, vp9_convolve8_avg_neon);
+    vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+    vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+    vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+    vpx_convolve8_neon, vpx_convolve8_avg_neon,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+#else  // HAVE_NEON_ASM
+const ConvolveFunctions convolve8_neon(
+    vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+    vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+    vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+    vpx_convolve8_neon, vpx_convolve8_avg_neon,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+#endif  // HAVE_NEON_ASM
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_neon),
@@ -709,13 +1744,17 @@
     make_tuple(64, 32, &convolve8_neon),
     make_tuple(32, 64, &convolve8_neon),
     make_tuple(64, 64, &convolve8_neon)));
-#endif
+#endif  // HAVE_NEON
 
 #if HAVE_DSPR2
 const ConvolveFunctions convolve8_dspr2(
-    vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
-    vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
-    vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2);
+    vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
+    vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
+    vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2,
+    vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_dspr2),
@@ -732,4 +1771,30 @@
     make_tuple(32, 64, &convolve8_dspr2),
     make_tuple(64, 64, &convolve8_dspr2)));
 #endif
+
+#if HAVE_MSA
+const ConvolveFunctions convolve8_msa(
+    vpx_convolve_copy_msa, vpx_convolve_avg_msa,
+    vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
+    vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa,
+    vpx_convolve8_msa, vpx_convolve8_avg_msa,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_msa),
+    make_tuple(8, 4, &convolve8_msa),
+    make_tuple(4, 8, &convolve8_msa),
+    make_tuple(8, 8, &convolve8_msa),
+    make_tuple(16, 8, &convolve8_msa),
+    make_tuple(8, 16, &convolve8_msa),
+    make_tuple(16, 16, &convolve8_msa),
+    make_tuple(32, 16, &convolve8_msa),
+    make_tuple(16, 32, &convolve8_msa),
+    make_tuple(32, 32, &convolve8_msa),
+    make_tuple(64, 32, &convolve8_msa),
+    make_tuple(32, 64, &convolve8_msa),
+    make_tuple(64, 64, &convolve8_msa)));
+#endif  // HAVE_MSA
 }  // namespace
diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc
index 4477bf0..8baa2f9 100644
--- a/libvpx/test/cpu_speed_test.cc
+++ b/libvpx/test/cpu_speed_test.cc
@@ -140,4 +140,9 @@
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
                       ::libvpx_test::kRealTime),
     ::testing::Range(0, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+    CpuSpeedTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 3));
 }  // namespace
diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc
index 8dcf26c..b6cae79 100644
--- a/libvpx/test/datarate_test.cc
+++ b/libvpx/test/datarate_test.cc
@@ -14,6 +14,7 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
 
 namespace {
 
@@ -38,10 +39,25 @@
     first_drop_ = 0;
     bits_total_ = 0;
     duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0)
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;
@@ -120,9 +136,67 @@
   double file_datarate_;
   double effective_datarate_;
   size_t bits_in_last_frame_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
 };
 
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int j = 1; j < 5; ++j) {
+    // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+    // denoiserOnAggressive, and denoiserOnAdaptive.
+    // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j
+    // refers to the blur thresholds: 20, 40, 60, 80.
+    // The j = 0 case (denoiser off) is covered in the tests below.
+    denoiser_on_ = j;
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+      << " The datarate for the file missed the target!";
+}
+#endif  // CONFIG_TEMPORAL_DENOISING
+
 TEST_P(DatarateTestLarge, BasicBufferModel) {
+  denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_dropframe_thresh = 1;
   cfg_.rc_max_quantizer = 56;
@@ -154,6 +228,7 @@
 }
 
 TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_max_quantizer = 36;
   cfg_.rc_end_usage = VPX_CBR;
@@ -203,10 +278,14 @@
     tot_frame_number_ = 0;
     first_drop_ = 0;
     num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
     // For testing up to 3 layers.
     for (int i = 0; i < 3; ++i) {
       bits_total_[i] = 0;
     }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
   }
 
   //
@@ -274,21 +353,30 @@
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0)
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically.
+        denoiser_on_ ^= 1;
+      }
     }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
     if (cfg_.ts_number_layers > 1) {
-      if (video->frame() == 1) {
+      if (video->frame() == 0) {
         encoder->Control(VP9E_SET_SVC, 1);
       }
-      vpx_svc_layer_id_t layer_id = {0, 0};
+      vpx_svc_layer_id_t layer_id;
       layer_id.spatial_layer_id = 0;
       frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
       layer_id.temporal_layer_id = SetLayerId(video->frame(),
                                               cfg_.ts_number_layers);
-      if (video->frame() > 0) {
-       encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
-      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
     }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -357,6 +445,9 @@
   int64_t bits_in_buffer_model_;
   vpx_codec_pts_t first_drop_;
   int num_drops_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
 };
 
 // Check basic rate targeting,
@@ -447,7 +538,7 @@
         << " The first dropped frame for drop_thresh " << i
         << " > first dropped frame for drop_thresh "
         << i - kDropFrameThreshTestStep;
-    ASSERT_GE(num_drops_, last_num_drops)
+    ASSERT_GE(num_drops_, last_num_drops * 0.90)
         << " The number of dropped frames for drop_thresh " << i
         << " < number of dropped frames for drop_thresh "
         << i - kDropFrameThreshTestStep;
@@ -473,20 +564,25 @@
   cfg_.ts_rate_decimator[0] = 2;
   cfg_.ts_rate_decimator[1] = 1;
 
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  if (deadline_ == VPX_DL_REALTIME)
+    cfg_.g_error_resilient = 1;
+
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
   for (int i = 200; i <= 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     // 60-40 bitrate allocation for 2 temporal layers.
-    cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
           << " The datarate for the file is lower than target by too much, "
               "for layer: " << j;
-      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
           << " The datarate for the file is greater than target by too much, "
               "for layer: " << j;
     }
@@ -511,25 +607,27 @@
   cfg_.ts_rate_decimator[1] = 2;
   cfg_.ts_rate_decimator[2] = 1;
 
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
   for (int i = 200; i <= 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     // 40-20-40 bitrate allocation for 3 temporal layers.
-    cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
       // TODO(yaowu): Work out more stable rc control strategy and
       //              Adjust the thresholds to be tighter than .75.
-      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
+      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
           << " The datarate for the file is lower than target by too much, "
               "for layer: " << j;
       // TODO(yaowu): Work out more stable rc control strategy and
       //              Adjust the thresholds to be tighter than 1.25.
-      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
+      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
           << " The datarate for the file is greater than target by too much, "
               "for layer: " << j;
     }
@@ -557,20 +655,22 @@
   cfg_.ts_rate_decimator[1] = 2;
   cfg_.ts_rate_decimator[2] = 1;
 
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
   cfg_.rc_target_bitrate = 200;
   ResetModel();
   // 40-20-40 bitrate allocation for 3 temporal layers.
-  cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-  cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-  cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-    ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
         << " The datarate for the file is lower than target by too much, "
             "for layer: " << j;
-    ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
         << " The datarate for the file is greater than target by too much, "
             "for layer: " << j;
     // Expect some frame drops in this test: for this 200 frames test,
@@ -580,9 +680,299 @@
   }
 }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, when denoiser is on.
+TEST_P(DatarateTestVP9Large, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING) there is
+  // currently only one denoiser mode, denoiserYonly (value 1); more modes may
+  // be added in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9Large, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING) there is
+  // currently only one denoiser mode, denoiserYonly (value 1); more modes may
+  // be added in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~DatarateOnePassCbrSvc() {}
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    speed_setting_ = GET_PARAM(2);
+    ResetModel();
+  }
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+  }
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      int i;
+      for (i = 0; i < 2; ++i) {
+        svc_params_.max_quantizers[i] = 63;
+        svc_params_.min_quantizers[i] = 0;
+      }
+      svc_params_.scaling_factor_num[0] = 144;
+      svc_params_.scaling_factor_den[0] = 288;
+      svc_params_.scaling_factor_num[1] = 288;
+      svc_params_.scaling_factor_den[1] = 288;
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+    if (last_pts_ == 0)
+      duration = 1;
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+    const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY)
+                         ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    bits_in_buffer_model_ -= frame_size_in_bits;
+    bits_total_ += frame_size_in_bits;
+    if (!first_drop_ && duration > 1)
+      first_drop_ = last_pts_ + 1;
+    last_pts_ = pkt->data.frame.pts;
+    bits_in_last_frame_ = frame_size_in_bits;
+    ++frame_number_;
+  }
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000.;  // in kilobits
+      duration_ = (last_pts_ + 1) * timebase_;
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
+          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() {
+    return mismatch_nframes_;
+  }
+
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  size_t bits_in_last_frame_;
+  vpx_svc_extra_cfg_t svc_params_;
+  int speed_setting_;
+  double mismatch_psnr_;
+  int mismatch_nframes_;
+};
+static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
+    const vpx_svc_extra_cfg_t *svc_params,
+    int spatial_layers,
+    int temporal_layers,
+    int temporal_layering_mode,
+    unsigned int total_rate) {
+  int sl, spatial_layer_target;
+  float total = 0;
+  float alloc_ratio[VPX_MAX_LAYERS] = {0};
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    if (svc_params->scaling_factor_den[sl] > 0) {
+      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] *
+          1.0 / svc_params->scaling_factor_den[sl]);
+      total += alloc_ratio[sl];
+    }
+  }
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    enc_cfg->ss_target_bitrate[sl] = spatial_layer_target =
+        (unsigned int)(enc_cfg->rc_target_bitrate *
+            alloc_ratio[sl] / total);
+    const int index = sl * temporal_layers;
+    if (temporal_layering_mode == 3) {
+      enc_cfg->layer_target_bitrate[index] =
+          spatial_layer_target >> 1;
+      enc_cfg->layer_target_bitrate[index + 1] =
+          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+      enc_cfg->layer_target_bitrate[index + 2] =
+          spatial_layer_target;
+    } else if (temporal_layering_mode == 2) {
+      enc_cfg->layer_target_bitrate[index] =
+          spatial_layer_target * 2 / 3;
+      enc_cfg->layer_target_bitrate[index + 1] =
+          spatial_layer_target;
+    }
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  // TODO(wonkap/marpan): No frame drop for now; correct frame dropping for
+  // SVC still needs to be implemented.
+  cfg_.rc_dropframe_thresh = 0;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
+  for (int i = 400; i <= 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+        cfg_.rc_target_bitrate);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  // TODO(wonkap/marpan): No frame drop for now; correct frame dropping for
+  // SVC still needs to be implemented.
+  cfg_.rc_dropframe_thresh = 0;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+      cfg_.rc_target_bitrate);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+}
+
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                           ::testing::Values(::libvpx_test::kOnePassGood,
-                          ::libvpx_test::kRealTime),
+                                            ::libvpx_test::kRealTime),
                           ::testing::Range(2, 7));
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 8));
 }  // namespace
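
For reference, the SVC rate split performed by assign_layer_bitrates() in the
test above divides the target bitrate across spatial layers in proportion to
their scaling factors and then, for temporal_layering_mode == 3, gives the
temporal layers cumulative shares of 50%, 75% and 100% of each spatial layer.
The standalone sketch below is an illustration only, not part of the patch;
all names in it are local to the sketch. It reproduces that arithmetic for the
configuration the tests use: scaling factors 144/288 and 288/288, three
temporal layers, 800 kbps total.

#include <cstdio>

int main() {
  // Mirrors assign_layer_bitrates() for temporal_layering_mode == 3 with the
  // two spatial layers configured in DatarateOnePassCbrSvc (144/288, 288/288).
  const int kSpatialLayers = 2;
  const int scaling_num[kSpatialLayers] = { 144, 288 };
  const int scaling_den[kSpatialLayers] = { 288, 288 };
  const unsigned int total_rate = 800;  // kbps, as in OnePassCbrSvc4threads

  float total = 0;
  float alloc_ratio[kSpatialLayers] = { 0 };
  for (int sl = 0; sl < kSpatialLayers; ++sl) {
    alloc_ratio[sl] = static_cast<float>(scaling_num[sl]) / scaling_den[sl];
    total += alloc_ratio[sl];
  }
  for (int sl = 0; sl < kSpatialLayers; ++sl) {
    const unsigned int spatial_target =
        static_cast<unsigned int>(total_rate * alloc_ratio[sl] / total);
    // Cumulative temporal-layer targets: 50%, 75% and 100% of the spatial
    // layer's share, exactly as in the mode-3 branch above.
    printf("spatial layer %d: %u kbps -> TL0 %u, TL0+TL1 %u, all %u\n",
           sl, spatial_target, spatial_target >> 1,
           (spatial_target >> 1) + (spatial_target >> 2), spatial_target);
  }
  return 0;
}

With these inputs the spatial layers get roughly 266 and 533 kbps, so the
per-temporal-layer targets come out to 133/199/266 and 266/399/533 kbps.
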
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index ee417ce..e9de76a 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -13,18 +13,18 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
-
-extern "C" {
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
@@ -258,40 +258,100 @@
   }
 }
 
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
+    Idct16x16Param;
 
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_fdct16x16_c(in, out, stride);
+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                   int /*tx_type*/) {
+  vpx_fdct16x16_c(in, out, stride);
 }
 
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
-  vp9_idct16x16_256_add_c(in, dest, stride);
+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                   int /*tx_type*/) {
+  vpx_idct16x16_256_add_c(in, dest, stride);
 }
 
-void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                  int tx_type) {
   vp9_fht16x16_c(in, out, stride, tx_type);
 }
 
-void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                  int tx_type) {
   vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+}
+
+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+}
+
+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_10(in, out, stride);
+}
+
+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_12(in, out, stride);
+}
+
+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+}
+
+void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+}
+
+void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE2
+void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+}
+
+void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+}
+
+void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+}
+
+void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 class Trans16x16TestBase {
  public:
   virtual ~Trans16x16TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
 
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunAccuracyCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,24 +359,49 @@
     int64_t total_error = 0;
     const int count_test_block = 10000;
     for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
 
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                           test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -324,24 +409,24 @@
       }
     }
 
-    EXPECT_GE(1u, max_error)
+    EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
         << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
 
-    EXPECT_GE(count_test_block , total_error)
+    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
         << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
   }
 
   void RunCoeffCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -355,23 +440,21 @@
   void RunMemCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -381,7 +464,7 @@
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
         EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
             << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
       }
     }
@@ -389,71 +472,116 @@
 
   void RunQuantCheck(int dc_thred, int ac_thred) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    const int count_test_block = 100000;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
 
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0)
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       if (i == 1)
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
 
       // clear reconstructed pixel buffers
-      vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
-      vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+      memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
+      memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+#if CONFIG_VP9_HIGHBITDEPTH
+      memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+      memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+#endif
 
       // quantization with maximum allowed step sizes
       output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
       for (int j = 1; j < kNumCoeffs; ++j)
         output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
-      inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
-
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(ref[j], dst[j]);
+      if (bit_depth_ == VPX_BITS_8) {
+        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+                     tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref[j], dst[j]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref16[j], dst16[j]);
+#endif
+      }
     }
   }
 
   void RunInvAccuracyCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     for (int i = 0; i < count_test_block; ++i) {
       double out_r[kNumCoeffs];
 
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
       }
 
       reference_16x16_dct_2d(in, out_r);
       for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = round(out_r[j]);
+        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
 
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            16));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         const uint32_t error = diff * diff;
         EXPECT_GE(1u, error)
             << "Error: 16x16 IDCT has error " << error
@@ -461,8 +589,68 @@
       }
     }
   }
+
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          // Random values less than the threshold, either positive or negative
+          coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                 pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 16x16 IDCT Comparison has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
   int pitch_;
   int tx_type_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
   FhtFunc fwd_txfm_ref;
   IhtFunc inv_txfm_ref;
 };
@@ -477,17 +665,34 @@
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
     pitch_    = 16;
     fwd_txfm_ref = fdct16x16_ref;
     inv_txfm_ref = idct16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case VPX_BITS_10:
+        inv_txfm_ref = idct16x16_10_ref;
+        break;
+      case VPX_BITS_12:
+        inv_txfm_ref = idct16x16_12_ref;
+        break;
+      default:
+        inv_txfm_ref = idct16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = idct16x16_ref;
+#endif
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -527,17 +732,34 @@
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
     pitch_    = 16;
     fwd_txfm_ref = fht16x16_ref;
     inv_txfm_ref = iht16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case VPX_BITS_10:
+        inv_txfm_ref = iht16x16_10;
+        break;
+      case VPX_BITS_12:
+        inv_txfm_ref = iht16x16_12;
+        break;
+      default:
+        inv_txfm_ref = iht16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = iht16x16_ref;
+#endif
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -560,50 +782,159 @@
 TEST_P(Trans16x16HT, QuantCheck) {
   // The encoder skips any non-DC intra prediction modes,
   // when the quantization step size goes beyond 988.
-  RunQuantCheck(549, 988);
+  RunQuantCheck(429, 729);
+}
+
+class InvTrans16x16DCT
+    : public Trans16x16TestBase,
+      public ::testing::TestWithParam<Idct16x16Param> {
+ public:
+  virtual ~InvTrans16x16DCT() {}
+
+  virtual void SetUp() {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    thresh_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    pitch_ = 16;
+    mask_ = (1 << bit_depth_) - 1;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  IdctFunc ref_txfm_;
+  IdctFunc inv_txfm_;
+  int thresh_;
+};
+
+TEST_P(InvTrans16x16DCT, CompareReference) {
+  CompareInvReference(ref_txfm_, thresh_);
 }
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
+        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_neon, 0)));
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_sse2,
-                   &vp9_idct16x16_256_add_sse2, 0)));
+        make_tuple(&vpx_fdct16x16_sse2,
+                   &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
-#endif
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, Trans16x16DCT,
+    SSE2, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
-#endif
+        make_tuple(&vpx_highbd_fdct16x16_sse2,
+                   &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_c,
+                   &idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_sse2,
+                   &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct16x16_c,
+                   &idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_sse2,
+                   &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
+                   VPX_BITS_8)));
+// Optimizations take effect at a threshold of 3155, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans16x16DCT,
+    ::testing::Values(
+        make_tuple(&idct16x16_10_add_10_c,
+                   &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10,
+                   &idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10_add_12_c,
+                   &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
+        make_tuple(&idct16x16_12,
+                   &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_msa,
+                   &vpx_idct16x16_256_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
+                   VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
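
The reworked 16x16 checks above (and the 32x32 checks in the next file) tie
both the input range and the allowed round-trip error to the bit depth under
test: inputs are confined to [-mask_, mask_] with mask_ = (1 << bit_depth_) - 1,
and the per-pixel squared-error bound becomes 1u << 2 * (bit_depth_ - 8), i.e.
it quadruples for every two extra bits. The short sketch below only tabulates
those limits; it is an illustration, not code from the patch.

#include <cstdio>

int main() {
  // Limits used by the round-trip accuracy checks for each supported depth.
  const int bit_depths[] = { 8, 10, 12 };  // VPX_BITS_8, VPX_BITS_10, VPX_BITS_12
  for (int i = 0; i < 3; ++i) {
    const int bd = bit_depths[i];
    const int mask = (1 << bd) - 1;                         // inputs in [-mask, mask]
    const unsigned int max_sq_error = 1u << 2 * (bd - 8);   // per-pixel bound
    printf("%2d-bit: inputs in [-%d, %d], per-pixel squared error <= %u\n",
           bd, mask, mask, max_sq_error);
  }
  return 0;
}

This prints bounds of 1, 16 and 256 for 8-, 10- and 12-bit input respectively,
matching the scaled EXPECT_GE checks in the tests.
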
diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc
index 4f34a44..f7327b1 100644
--- a/libvpx/test/dct32x32_test.cc
+++ b/libvpx/test/dct32x32_test.cc
@@ -13,15 +13,18 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
@@ -37,7 +40,7 @@
 
 const int kNumCoeffs = 1024;
 const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 32; k++) {
     out[k] = 0.0;
@@ -55,7 +58,7 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = input[j*32 + i];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 32; ++j)
       output[j * 32 + i] = temp_out[j];
   }
@@ -64,17 +67,32 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = output[j + i*32];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 32; ++j)
       output[j + i * 32] = temp_out[j] / 4;
   }
 }
 
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int> Trans32x32Param;
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+    Trans32x32Param;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 8);
+}
+
+void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
+}
+
+void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
  public:
@@ -84,12 +102,16 @@
     inv_txfm_ = GET_PARAM(1);
     version_  = GET_PARAM(2);  // 0: high precision forward transform
                                // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
   int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
   FwdTxfmFunc fwd_txfm_;
   InvTxfmFunc inv_txfm_;
 };
@@ -98,25 +120,49 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   uint32_t max_error = 0;
   int64_t total_error = 0;
-  const int count_test_block = 1000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+  const int count_test_block = 10000;
+  DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
 
   for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
     for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      test_input_block[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        test_input_block[j] = src16[j] - dst16[j];
+#endif
+      }
     }
 
     ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,
+                                         CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
 
     for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint32_t diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
       const uint32_t diff = dst[j] - src[j];
+#endif
       const uint32_t error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -129,10 +175,10 @@
     total_error /= 45;
   }
 
-  EXPECT_GE(1u, max_error)
+  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
       << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
 
-  EXPECT_GE(count_test_block, total_error)
+  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
       << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
 }
 
@@ -140,16 +186,16 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
 
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
   for (int i = 0; i < count_test_block; ++i) {
     for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
     const int stride = 32;
-    vp9_fdct32x32_c(input_block, output_ref_block, stride);
+    vpx_fdct32x32_c(input_block, output_ref_block, stride);
     ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
 
     if (version_ == 0) {
@@ -168,27 +214,25 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 2000;
 
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
   for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
     for (int j = 0; j < kNumCoeffs; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
     }
     if (i == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = 255;
+        input_extreme_block[j] = mask_;
     } else if (i == 1) {
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = -255;
+        input_extreme_block[j] = -mask_;
     }
 
     const int stride = 32;
-    vp9_fdct32x32_c(input_extreme_block, output_ref_block, stride);
+    vpx_fdct32x32_c(input_extreme_block, output_ref_block, stride);
     ASM_REGISTER_STATE_CHECK(
         fwd_txfm_(input_extreme_block, output_block, stride));
 
@@ -201,9 +245,9 @@
         EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
             << "Error: 32x32 FDCT rd has mismatched coefficients";
       }
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
           << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
           << "Error: 32x32 FDCT has coefficient larger than "
           << "4*DCT_MAX_VALUE";
     }
@@ -213,27 +257,50 @@
 TEST_P(Trans32x32Test, InverseAccuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
 
   for (int i = 0; i < count_test_block; ++i) {
     double out_r[kNumCoeffs];
 
     // Initialize a test block with input range [-255, 255]
     for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      in[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        in[j] = src16[j] - dst16[j];
+#endif
+      }
     }
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < kNumCoeffs; ++j)
-      coeff[j] = round(out_r[j]);
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+      coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
     for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
       const int diff = dst[j] - src[j];
+#endif
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 32x32 IDCT has error " << error
@@ -244,39 +311,85 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
-        make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
+        make_tuple(&vpx_highbd_fdct32x32_c,
+                   &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_rd_c,
+                   &idct32x32_10, 1, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_c,
+                   &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct32x32_rd_c,
+                   &idct32x32_12, 1, VPX_BITS_12),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_neon, 0),
-        make_tuple(&vp9_fdct32x32_rd_c,
-                   &vp9_idct32x32_1024_add_neon, 1)));
-#endif
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
-        make_tuple(&vp9_fdct32x32_rd_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
-#endif
+        make_tuple(&vpx_fdct32x32_sse2,
+                   &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_sse2,
+                   &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_AVX2
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
+                   VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
+                   VPX_BITS_12),
+        make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_c, 0,
+                   VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
+                   VPX_BITS_8)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
-        make_tuple(&vp9_fdct32x32_rd_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
-#endif
+        make_tuple(&vpx_fdct32x32_avx2,
+                   &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_avx2,
+                   &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_msa,
+                   &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_msa,
+                   &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
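
In this change reference_32x32_dct_1d() loses its unused stride argument; only
the signature and the k == 0 normalization constant are visible in the hunk.
For readers unfamiliar with it, the sketch below is a generic 32-point DCT-II
reference of the kind such a function represents. It is an assumption-labelled
illustration, not a copy of the libvpx implementation.

#include <cmath>
#include <cstdio>

// Generic 32-point DCT-II; the 1/sqrt(2) factor normalizes the DC basis.
void reference_dct_1d(const double in[32], double out[32]) {
  const double kPi = 3.141592653589793238462643383279502884;
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 32; ++k) {
    out[k] = 0.0;
    for (int n = 0; n < 32; ++n)
      out[k] += in[n] * std::cos(kPi * (2 * n + 1) * k / 64.0);
    if (k == 0)
      out[k] *= kInvSqrt2;
  }
}

int main() {
  double in[32], out[32];
  for (int n = 0; n < 32; ++n)
    in[n] = (n & 1) ? 255.0 : -255.0;  // an extreme alternating input
  reference_dct_1d(in, out);
  printf("DC = %.2f, first AC = %.2f\n", out[0], out[1]);
  return 0;
}
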
diff --git a/libvpx/test/decode_api_test.cc b/libvpx/test/decode_api_test.cc
index 2837f8c..318351b 100644
--- a/libvpx/test/decode_api_test.cc
+++ b/libvpx/test/decode_api_test.cc
@@ -7,10 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "test/ivf_video_source.h"
 #include "./vpx_config.h"
+#include "test/ivf_video_source.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
@@ -26,6 +27,9 @@
 #if CONFIG_VP9_DECODER
     &vpx_codec_vp9_dx_algo,
 #endif
+#if CONFIG_VP10_DECODER
+    &vpx_codec_vp10_dx_algo,
+#endif
   };
   uint8_t buf[1] = {0};
   vpx_codec_ctx_t dec;
@@ -57,6 +61,21 @@
   }
 }
 
+#if CONFIG_VP8_DECODER
+TEST(DecodeAPI, OptionalParams) {
+  vpx_codec_ctx_t dec;
+
+#if CONFIG_ERROR_CONCEALMENT
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL,
+                                             VPX_CODEC_USE_ERROR_CONCEALMENT));
+#else
+  EXPECT_EQ(VPX_CODEC_INCAPABLE,
+            vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL,
+                               VPX_CODEC_USE_ERROR_CONCEALMENT));
+#endif  // CONFIG_ERROR_CONCEALMENT
+}
+#endif  // CONFIG_VP8_DECODER
+
 #if CONFIG_VP9_DECODER
 // Test VP9 codec controls after a decode error to ensure the code doesn't
 // misbehave.
@@ -65,6 +84,7 @@
     VP8D_GET_LAST_REF_UPDATES,
     VP8D_GET_FRAME_CORRUPTED,
     VP9D_GET_DISPLAY_SIZE,
+    VP9D_GET_FRAME_SIZE
   };
   int val[2];
 
@@ -113,8 +133,13 @@
   vpx_codec_ctx_t dec;
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
   const uint32_t frame_size = static_cast<uint32_t>(video.frame_size());
+#if CONFIG_VP9_HIGHBITDEPTH
   EXPECT_EQ(VPX_CODEC_MEM_ERROR,
             vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#else
+  EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
+            vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#endif
   vpx_codec_iter_t iter = NULL;
   EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
 
diff --git a/libvpx/test/decode_perf_test.cc b/libvpx/test/decode_perf_test.cc
index 11529b3..c24d517 100644
--- a/libvpx/test/decode_perf_test.cc
+++ b/libvpx/test/decode_perf_test.cc
@@ -8,13 +8,17 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <string>
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
 #include "test/ivf_video_source.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/webm_video_source.h"
 #include "vpx_ports/vpx_timer.h"
+#include "./ivfenc.h"
 #include "./vpx_version.h"
 
 using std::tr1::make_tuple;
@@ -24,7 +28,9 @@
 #define VIDEO_NAME 0
 #define THREADS 1
 
+const int kMaxPsnr = 100;
 const double kUsecsInSec = 1000000.0;
+const char kNewEncodeOutputFile[] = "new_encode.ivf";
 
 /*
  DecodePerfTest takes a tuple of filename + number of threads to decode with
@@ -74,7 +80,7 @@
   libvpx_test::WebMVideoSource video(video_name);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
@@ -105,4 +111,163 @@
 INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
                         ::testing::ValuesIn(kVP9DecodePerfVectors));
 
+class VP9NewEncodeDecodePerfTest :
+    public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  VP9NewEncodeDecodePerfTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        speed_(0),
+        outfile_(0),
+        out_frames_(0) {
+  }
+
+  virtual ~VP9NewEncodeDecodePerfTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 25;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_resize_allowed = 0;
+    cfg_.rc_end_usage = VPX_VBR;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, speed_);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH");
+    const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
+    outfile_ = fopen(path_to_source.c_str(), "wb");
+    ASSERT_TRUE(outfile_ != NULL);
+  }
+
+  virtual void EndPassHook() {
+    if (outfile_ != NULL) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+
+    // Write frame header and data.
+    ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+    ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+              pkt->data.frame.sz);
+  }
+
+  virtual bool DoDecode() { return false; }
+
+  void set_speed(unsigned int speed) {
+    speed_ = speed;
+  }
+
+ private:
+  libvpx_test::TestMode encoding_mode_;
+  uint32_t speed_;
+  FILE *outfile_;
+  uint32_t out_frames_;
+};
+
+struct EncodePerfTestVideo {
+  EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+                      uint32_t bitrate_, int frames_)
+      : name(name_),
+        width(width_),
+        height(height_),
+        bitrate(bitrate_),
+        frames(frames_) {}
+  const char *name;
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;
+  int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+  EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
+  SetUp();
+
+  // TODO(JBB): Make this work by going through the set of given files.
+  const int i = 0;
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  const char *video_name = kVP9EncodePerfTestVectors[i].name;
+  libvpx_test::I420VideoSource video(
+      video_name,
+      kVP9EncodePerfTestVectors[i].width,
+      kVP9EncodePerfTestVectors[i].height,
+      timebase.den, timebase.num, 0,
+      kVP9EncodePerfTestVectors[i].frames);
+  set_speed(2);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  const uint32_t threads = 4;
+
+  libvpx_test::IVFVideoSource decode_video(kNewEncodeOutputFile);
+  decode_video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = threads;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  vpx_usec_timer t;
+  vpx_usec_timer_start(&t);
+
+  for (decode_video.Begin(); decode_video.cxdata() != NULL;
+       decode_video.Next()) {
+    decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
+  }
+
+  vpx_usec_timer_mark(&t);
+  const double elapsed_secs =
+      static_cast<double>(vpx_usec_timer_elapsed(&t)) / kUsecsInSec;
+  const unsigned decode_frames = decode_video.frame_number();
+  const double fps = static_cast<double>(decode_frames) / elapsed_secs;
+
+  printf("{\n");
+  printf("\t\"type\" : \"decode_perf_test\",\n");
+  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
+  printf("\t\"threadCount\" : %u,\n", threads);
+  printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+  printf("\t\"totalFrames\" : %u,\n", decode_frames);
+  printf("\t\"framesPerSecond\" : %f\n", fps);
+  printf("}\n");
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+  VP9NewEncodeDecodePerfTest, ::testing::Values(::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/libvpx/test/decode_test_driver.cc b/libvpx/test/decode_test_driver.cc
index 99610eb..ad861c3 100644
--- a/libvpx/test/decode_test_driver.cc
+++ b/libvpx/test/decode_test_driver.cc
@@ -7,9 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/register_state_check.h"
 #include "test/video_source.h"
 
@@ -65,7 +67,7 @@
 
 void DecoderTest::RunLoop(CompressedVideoSource *video,
                           const vpx_codec_dec_cfg_t &dec_cfg) {
-  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
+  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, flags_, 0);
   ASSERT_TRUE(decoder != NULL);
   bool end_of_file = false;
 
@@ -106,8 +108,16 @@
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   RunLoop(video, dec_cfg);
 }
 
+void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
+  memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+}
+
+void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
+  flags_ = flags;
+}
+
 }  // namespace libvpx_test
diff --git a/libvpx/test/decode_test_driver.h b/libvpx/test/decode_test_driver.h
index 1f73c7d..f566c53 100644
--- a/libvpx/test/decode_test_driver.h
+++ b/libvpx/test/decode_test_driver.h
@@ -41,7 +41,13 @@
 class Decoder {
  public:
   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline), init_done_(false) {
+      : cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+          unsigned long deadline)  // NOLINT
+      : cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
     memset(&decoder_, 0, sizeof(decoder_));
   }
 
@@ -66,9 +72,7 @@
   }
 
   void Control(int ctrl_id, int arg) {
-    InitOnce();
-    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
-    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+    Control(ctrl_id, arg, VPX_CODEC_OK);
   }
 
   void Control(int ctrl_id, const void *arg) {
@@ -77,6 +81,12 @@
     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
   }
 
+  void Control(int ctrl_id, int arg, vpx_codec_err_t expected_value) {
+    InitOnce();
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(expected_value, res) << DecodeError();
+  }
+
   const char* DecodeError() {
     const char *detail = vpx_codec_error_detail(&decoder_);
     return detail ? detail : vpx_codec_error(&decoder_);
@@ -97,6 +107,10 @@
 
   bool IsVP8() const;
 
+  vpx_codec_ctx_t * GetDecoder() {
+    return &decoder_;
+  }
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const = 0;
 
@@ -104,7 +118,7 @@
     if (!init_done_) {
       const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
                                                      CodecInterface(),
-                                                     &cfg_, 0);
+                                                     &cfg_, flags_);
       ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
       init_done_ = true;
     }
@@ -112,6 +126,7 @@
 
   vpx_codec_ctx_t     decoder_;
   vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
   unsigned int        deadline_;
   bool                init_done_;
 };
@@ -124,21 +139,24 @@
   virtual void RunLoop(CompressedVideoSource *video,
                        const vpx_codec_dec_cfg_t &dec_cfg);
 
+  virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
+  virtual void set_flags(const vpx_codec_flags_t flags);
+
   // Hook to be called before decompressing every frame.
-  virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
-                                  Decoder *decoder) {}
+  virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+                                  Decoder* /*decoder*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const CompressedVideoSource& /* video */,
+                                  const CompressedVideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     const unsigned int frame_number) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     const unsigned int /*frame_number*/) {}
 
   // Hook to be called on peek result
   virtual void HandlePeekResult(Decoder* const decoder,
@@ -146,11 +164,16 @@
                                 const vpx_codec_err_t res_peek);
 
  protected:
-  explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}
+  explicit DecoderTest(const CodecFactory *codec)
+      : codec_(codec),
+        cfg_(),
+        flags_(0) {}
 
   virtual ~DecoderTest() {}
 
   const CodecFactory *codec_;
+  vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
 };
 
 }  // namespace libvpx_test
diff --git a/libvpx/test/encode_perf_test.cc b/libvpx/test/encode_perf_test.cc
index feef37e..7e9f0d6 100644
--- a/libvpx/test/encode_perf_test.cc
+++ b/libvpx/test/encode_perf_test.cc
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "./vpx_version.h"
@@ -50,7 +51,8 @@
   EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
 };
 
-const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 12 };
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };
+const int kEncodePerfTestThreads[] = { 1, 2, 4 };
 
 #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))
 
@@ -63,7 +65,8 @@
         min_psnr_(kMaxPsnr),
         nframes_(0),
         encoding_mode_(GET_PARAM(1)),
-        speed_(0) {}
+        speed_(0),
+        threads_(1) {}
 
   virtual ~VP9EncodePerfTest() {}
 
@@ -82,12 +85,18 @@
     cfg_.rc_buf_optimal_sz = 600;
     cfg_.rc_resize_allowed = 0;
     cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_error_resilient = 1;
+    cfg_.g_threads = threads_;
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
+      const int log2_tile_columns = 3;
       encoder->Control(VP8E_SET_CPUUSED, speed_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, log2_tile_columns);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
     }
   }
 
@@ -113,54 +122,77 @@
     speed_ = speed;
   }
 
+  void set_threads(unsigned int threads) {
+    threads_ = threads;
+  }
+
  private:
   double min_psnr_;
   unsigned int nframes_;
   libvpx_test::TestMode encoding_mode_;
   unsigned speed_;
+  unsigned int threads_;
 };
 
 TEST_P(VP9EncodePerfTest, PerfTest) {
   for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) {
     for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) {
-      SetUp();
+      for (size_t k = 0; k < NELEMENTS(kEncodePerfTestThreads); ++k) {
+        if (kVP9EncodePerfTestVectors[i].width < 512 &&
+            kEncodePerfTestThreads[k] > 1)
+          continue;
+        else if (kVP9EncodePerfTestVectors[i].width < 1024 &&
+                 kEncodePerfTestThreads[k] > 2)
+          continue;
 
-      const vpx_rational timebase = { 33333333, 1000000000 };
-      cfg_.g_timebase = timebase;
-      cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
+        set_threads(kEncodePerfTestThreads[k]);
+        SetUp();
 
-      init_flags_ = VPX_CODEC_USE_PSNR;
+        const vpx_rational timebase = { 33333333, 1000000000 };
+        cfg_.g_timebase = timebase;
+        cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
 
-      const unsigned frames = kVP9EncodePerfTestVectors[i].frames;
-      const char *video_name = kVP9EncodePerfTestVectors[i].name;
-      libvpx_test::I420VideoSource video(
-          video_name,
-          kVP9EncodePerfTestVectors[i].width,
-          kVP9EncodePerfTestVectors[i].height,
-          timebase.den, timebase.num, 0,
-          kVP9EncodePerfTestVectors[i].frames);
-      set_speed(kEncodePerfTestSpeeds[j]);
+        init_flags_ = VPX_CODEC_USE_PSNR;
 
-      vpx_usec_timer t;
-      vpx_usec_timer_start(&t);
+        const unsigned frames = kVP9EncodePerfTestVectors[i].frames;
+        const char *video_name = kVP9EncodePerfTestVectors[i].name;
+        libvpx_test::I420VideoSource video(
+            video_name,
+            kVP9EncodePerfTestVectors[i].width,
+            kVP9EncodePerfTestVectors[i].height,
+            timebase.den, timebase.num, 0,
+            kVP9EncodePerfTestVectors[i].frames);
+        set_speed(kEncodePerfTestSpeeds[j]);
 
-      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+        vpx_usec_timer t;
+        vpx_usec_timer_start(&t);
 
-      vpx_usec_timer_mark(&t);
-      const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec;
-      const double fps = frames / elapsed_secs;
-      const double minimum_psnr = min_psnr();
+        ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
-      printf("{\n");
-      printf("\t\"type\" : \"encode_perf_test\",\n");
-      printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
-      printf("\t\"videoName\" : \"%s\",\n", video_name);
-      printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
-      printf("\t\"totalFrames\" : %u,\n", frames);
-      printf("\t\"framesPerSecond\" : %f,\n", fps);
-      printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
-      printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]);
-      printf("}\n");
+        vpx_usec_timer_mark(&t);
+        const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec;
+        const double fps = frames / elapsed_secs;
+        const double minimum_psnr = min_psnr();
+        std::string display_name(video_name);
+        if (kEncodePerfTestThreads[k] > 1) {
+          char thread_count[32];
+          snprintf(thread_count, sizeof(thread_count), "_t-%d",
+                   kEncodePerfTestThreads[k]);
+          display_name += thread_count;
+        }
+
+        printf("{\n");
+        printf("\t\"type\" : \"encode_perf_test\",\n");
+        printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+        printf("\t\"videoName\" : \"%s\",\n", display_name.c_str());
+        printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
+        printf("\t\"totalFrames\" : %u,\n", frames);
+        printf("\t\"framesPerSecond\" : %f,\n", fps);
+        printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
+        printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
+        printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
+        printf("}\n");
+      }
     }
   }
 }
diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc
index 6d4281d..be4ef9a 100644
--- a/libvpx/test/encode_test_driver.cc
+++ b/libvpx/test/encode_test_driver.cc
@@ -8,15 +8,59 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/decode_test_driver.h"
-#include "test/register_state_check.h"
-#include "test/video_source.h"
+#include <string>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
 namespace libvpx_test {
+void Encoder::InitEncoder(VideoSource *video) {
+  vpx_codec_err_t res;
+  const vpx_image_t *img = video->img();
+
+  if (video->img() && !encoder_.priv) {
+    cfg_.g_w = img->d_w;
+    cfg_.g_h = img->d_h;
+    cfg_.g_timebase = video->timebase();
+    cfg_.rc_twopass_stats_in = stats_->buf();
+
+    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
+                             init_flags_);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+
+#if CONFIG_VP9_ENCODER
+    if (CodecInterface() == &vpx_codec_vp9_cx_algo) {
+      // Default to 1 tile column for VP9.
+      const int log2_tile_columns = 0;
+      res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
+                               log2_tile_columns);
+      ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    } else
+#endif
+#if CONFIG_VP10_ENCODER
+    if (CodecInterface() == &vpx_codec_vp10_cx_algo) {
+      // Default to 1 tile column for VP10.
+      const int log2_tile_columns = 0;
+      res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
+                               log2_tile_columns);
+      ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    } else
+#endif
+    {
+#if CONFIG_VP8_ENCODER
+      ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
+          << "Unknown Codec Interface";
+#endif
+    }
+  }
+}
+
 void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
   if (video->img())
     EncodeFrameInternal(*video, frame_flags);
@@ -39,17 +83,6 @@
   vpx_codec_err_t res;
   const vpx_image_t *img = video.img();
 
-  // Handle first frame initialization
-  if (!encoder_.priv) {
-    cfg_.g_w = img->d_w;
-    cfg_.g_h = img->d_h;
-    cfg_.g_timebase = video.timebase();
-    cfg_.rc_twopass_stats_in = stats_->buf();
-    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
-                             init_flags_);
-    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
-  }
-
   // Handle frame resizing
   if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
     cfg_.g_w = img->d_w;
@@ -60,8 +93,7 @@
 
   // Encode the frame
   API_REGISTER_STATE_CHECK(
-      res = vpx_codec_encode(&encoder_,
-                             video.img(), video.pts(), video.duration(),
+      res = vpx_codec_encode(&encoder_, img, video.pts(), video.duration(),
                              frame_flags, deadline_));
   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }
@@ -77,6 +109,7 @@
 
 void EncoderTest::InitializeConfig() {
   const vpx_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);
+  dec_cfg_ = vpx_codec_dec_cfg_t();
   ASSERT_EQ(VPX_CODEC_OK, res);
 }
 
@@ -110,6 +143,7 @@
 static bool compare_img(const vpx_image_t *img1,
                         const vpx_image_t *img2) {
   bool match = (img1->fmt == img2->fmt) &&
+               (img1->cs == img2->cs) &&
                (img1->d_w == img2->d_w) &&
                (img1->d_h == img2->d_h);
 
@@ -133,13 +167,13 @@
   return match;
 }
 
-void EncoderTest::MismatchHook(const vpx_image_t *img1,
-                               const vpx_image_t *img2) {
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+                               const vpx_image_t* /*img2*/) {
   ASSERT_TRUE(0) << "Encode/Decode mismatch found";
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
 
   stats_.Reset();
 
@@ -158,9 +192,18 @@
     Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,
                                                    &stats_);
     ASSERT_TRUE(encoder != NULL);
-    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
+
+    video->Begin();
+    encoder->InitEncoder(video);
+
+    unsigned long dec_init_flags = 0;  // NOLINT
+    // Use fragment decoder if encoder outputs partitions.
+    // NOTE: fragment decoder and partition encoder are only supported by VP8.
+    if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
+      dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
+    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
     bool again;
-    for (again = true, video->Begin(); again; video->Next()) {
+    for (again = true; again; video->Next()) {
       again = (video->img() != NULL);
 
       PreEncodeFrameHook(video);
@@ -200,6 +243,13 @@
         }
       }
 
+      // Flush the decoder when there are no more fragments.
+      if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
+        const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+        if (!HandleDecodeResult(res_dec, *video, decoder))
+          break;
+      }
+
       if (has_dxdata && has_cxdata) {
         const vpx_image_t *img_enc = encoder->GetPreviewFrame();
         DxDataIterator dec_iter = decoder->GetDxData();
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 2270ce2..9ecc498 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -13,12 +13,13 @@
 #include <string>
 #include <vector>
 
-#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx/vpx_encoder.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+
+#include "./vpx_config.h"
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 #include "vpx/vp8cx.h"
 #endif
+#include "vpx/vpx_encoder.h"
 
 namespace libvpx_test {
 
@@ -104,6 +105,8 @@
     return CxDataIterator(&encoder_);
   }
 
+  void InitEncoder(VideoSource *video);
+
   const vpx_image_t *GetPreviewFrame() {
     return vpx_codec_get_preview_frame(&encoder_);
   }
@@ -131,13 +134,23 @@
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+  void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
   void Control(int ctrl_id, vpx_active_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 #endif
 
+  void Config(const vpx_codec_enc_cfg_t *cfg) {
+    const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    cfg_ = *cfg;
+  }
+
   void set_deadline(unsigned long deadline) {
     deadline_ = deadline;
   }
@@ -175,7 +188,10 @@
  protected:
   explicit EncoderTest(const CodecFactory *codec)
       : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
-        last_pts_(0) {}
+        last_pts_(0) {
+    // Default to 1 thread.
+    cfg_.g_threads = 1;
+  }
 
   virtual ~EncoderTest() {}
 
@@ -185,24 +201,30 @@
   // Map the TestMode enum to the deadline_ and passes_ variables.
   void SetMode(TestMode mode);
 
+  // Set encoder flag.
+  void set_init_flags(unsigned long flag) {  // NOLINT(runtime/int)
+    init_flags_ = flag;
+  }
+
   // Main loop
   virtual void RunLoop(VideoSource *video);
 
   // Hook to be called at the beginning of a pass.
-  virtual void BeginPassHook(unsigned int pass) {}
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
 
   // Hook to be called at the end of a pass.
   virtual void EndPassHook() {}
 
   // Hook to be called before encoding a frame.
-  virtual void PreEncodeFrameHook(VideoSource *video) {}
-  virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+                                  Encoder* /*encoder*/) {}
 
   // Hook to be called on every compressed data packet.
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to be called on every PSNR packet.
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
@@ -218,25 +240,26 @@
                             const vpx_image_t *img2);
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     vpx_codec_pts_t pts) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     vpx_codec_pts_t /*pts*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const VideoSource& /* video */,
+                                  const VideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook that can modify the encoder's output data
-  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
       const vpx_codec_cx_pkt_t *pkt) {
     return pkt;
   }
 
   bool                 abort_;
   vpx_codec_enc_cfg_t  cfg_;
+  vpx_codec_dec_cfg_t  dec_cfg_;
   unsigned int         passes_;
   unsigned long        deadline_;
   TwopassStatsStore    stats_;
diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc
index 89684f8..9e512ad 100644
--- a/libvpx/test/error_resilience_test.cc
+++ b/libvpx/test/error_resilience_test.cc
@@ -37,6 +37,7 @@
   void Reset() {
     error_nframes_ = 0;
     droppable_nframes_ = 0;
+    pattern_switch_ = 0;
   }
 
   virtual void SetUp() {
@@ -56,22 +57,77 @@
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {
+  //
+  // Frame flags and layer id for temporal layers.
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // LAST is updated on base/layer 0, GOLDEN is updated on layer 1.
+  // Non-zero pattern_switch parameter means pattern will switch to
+  // not using LAST for frame_num >= pattern_switch.
+  int SetFrameFlags(int frame_num,
+                    int num_temp_layers,
+                    int pattern_switch) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+        if (frame_num % 2 == 0) {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 0: predict from LAST and ARF, update LAST.
+            frame_flags = VP8_EFLAG_NO_REF_GF |
+                          VP8_EFLAG_NO_UPD_GF |
+                          VP8_EFLAG_NO_UPD_ARF;
+          } else {
+            // Layer 0: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        } else {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 1: predict from L, GF, and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_UPD_LAST;
+          } else {
+            // Layer 1: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        }
+    }
+    return frame_flags;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
     frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |
                       VP8_EFLAG_NO_UPD_GF |
                       VP8_EFLAG_NO_UPD_ARF);
-    if (droppable_nframes_ > 0 &&
-        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+    // For temporal layer case.
+    if (cfg_.ts_number_layers > 1) {
+      frame_flags_ = SetFrameFlags(video->frame(),
+                                   cfg_.ts_number_layers,
+                                   pattern_switch_);
       for (unsigned int i = 0; i < droppable_nframes_; ++i) {
         if (droppable_frames_[i] == video->frame()) {
-          std::cout << "             Encoding droppable frame: "
+          std::cout << "Encoding droppable frame: "
                     << droppable_frames_[i] << "\n";
-          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
-                           VP8_EFLAG_NO_UPD_GF |
-                           VP8_EFLAG_NO_UPD_ARF);
-          return;
         }
       }
+    } else {
+       if (droppable_nframes_ > 0 &&
+         (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+         for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+           if (droppable_frames_[i] == video->frame()) {
+             std::cout << "Encoding droppable frame: "
+                       << droppable_frames_[i] << "\n";
+             frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
+                              VP8_EFLAG_NO_UPD_GF |
+                              VP8_EFLAG_NO_UPD_ARF);
+             return;
+           }
+         }
+       }
     }
   }
 
@@ -133,11 +189,16 @@
     return mismatch_nframes_;
   }
 
+  void SetPatternSwitch(int frame_switch) {
+    pattern_switch_ = frame_switch;
+  }
+
  private:
   double psnr_;
   unsigned int nframes_;
   unsigned int error_nframes_;
   unsigned int droppable_nframes_;
+  unsigned int pattern_switch_;
   double mismatch_psnr_;
   unsigned int mismatch_nframes_;
   unsigned int error_frames_[kMaxErrorFrames];
@@ -236,7 +297,291 @@
 #endif
 }
 
-VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) the enhancement layer frames for a
+// two layer temporal pattern. The base layer does not predict from the top
+// layer, so successful decoding is expected.
+TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
 
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 40);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(0);
+
+  // The odd frames are the enhancement layer for the 2 layer pattern, so set
+  // those frames as droppable. Drop the last 7 frames.
+  unsigned int num_droppable_frames = 7;
+  unsigned int droppable_frame_list[] = {27, 29, 31, 33, 35, 37, 39};
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset the previously set error/droppable frames.
+  Reset();
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// for a two layer temporal pattern, where at some point in the
+// sequence, the LAST ref is not used anymore.
+TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
+
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 100);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(60);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset the previously set error/droppable frames.
+  Reset();
+}
+
+class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  ErrorResilienceTestLargeCodecControls()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)) {
+    Reset();
+  }
+
+  virtual ~ErrorResilienceTestLargeCodecControls() {}
+
+  void Reset() {
+    last_pts_ = 0;
+    tot_frame_number_ = 0;
+    // For testing up to 3 layers.
+    for (int i = 0; i < 3; ++i) {
+      bits_total_[i] = 0;
+    }
+    duration_ = 0.0;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0          4            ....
+  // LAST is always updated on base/layer 0, GOLDEN is updated on layer 1,
+  // and ALTREF is updated on top layer for 3 layer pattern.
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
+                      VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+         layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (cfg_.ts_number_layers > 1) {
+        int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+        int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+        if (video->frame() > 0) {
+          encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+          encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+        }
+       const vpx_rational_t tb = video->timebase();
+       timebase_ = static_cast<double>(tb.num) / tb.den;
+       duration_ = 0;
+       return;
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+    if (duration > 1) {
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    // Update the total encoded bits. For temporal layers, update the cumulative
+    // encoded bits per layer.
+    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
+      bits_total_[i] += frame_size_in_bits;
+    }
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = (last_pts_ + 1) * timebase_;
+    if (cfg_.ts_number_layers > 1) {
+      for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
+          ++layer) {
+        if (bits_total_[layer]) {
+          // Effective file datarate:
+          effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
+        }
+      }
+    }
+  }
+
+  double effective_datarate_[3];
+ private:
+  libvpx_test::TestMode encoding_mode_;
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int64_t bits_total_[3];
+  double duration_;
+  int tot_frame_number_;
+};
+
+// Check the two codec controls used for (1) setting the temporal layer id and
+// (2) setting the encoder flags.
+// This test invokes those controls for each frame, verifies that there is no
+// encoder/decoder mismatch, and checks the basic rate control response.
+// TODO(marpan): Maybe move this test to datarate_test.cc.
+TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  cfg_.g_error_resilient = 1;
+
+  // 3 Temporal layers. Framerate decimation (4, 2, 1).
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.ts_periodicity = 4;
+  cfg_.ts_layer_id[0] = 0;
+  cfg_.ts_layer_id[1] = 2;
+  cfg_.ts_layer_id[2] = 1;
+  cfg_.ts_layer_id[3] = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  for (int i = 200; i <= 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    Reset();
+    // 40-20-40 bitrate allocation for 3 temporal layers.
+    cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+    cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
+          << " The datarate for the file is lower than target by too much, "
+              "for layer: " << j;
+      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
+          << " The datarate for the file is greater than target by too much, "
+              "for layer: " << j;
+    }
+  }
+}
+
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
+                          ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
+VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
 }  // namespace
diff --git a/libvpx/test/examples.sh b/libvpx/test/examples.sh
index 7ba9cce..39f7e39 100755
--- a/libvpx/test/examples.sh
+++ b/libvpx/test/examples.sh
@@ -15,7 +15,7 @@
 example_tests=$(ls $(dirname $0)/*.sh)
 
 # List of script names to exclude.
-exclude_list="examples vpxdec vpxenc tools_common"
+exclude_list="examples tools_common"
 
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
diff --git a/libvpx/test/external_frame_buffer_test.cc b/libvpx/test/external_frame_buffer_test.cc
index fb0449d..d02dca2 100644
--- a/libvpx/test/external_frame_buffer_test.cc
+++ b/libvpx/test/external_frame_buffer_test.cc
@@ -71,6 +71,7 @@
     if (ext_fb_list_[idx].size < min_size) {
       delete [] ext_fb_list_[idx].data;
       ext_fb_list_[idx].data = new uint8_t[min_size];
+      memset(ext_fb_list_[idx].data, 0, min_size);
       ext_fb_list_[idx].size = min_size;
     }
 
@@ -96,13 +97,19 @@
     return 0;
   }
 
-  // Marks the external frame buffer that |fb| is pointing too as free.
+  // Marks the external frame buffer that |fb| is pointing to as free.
   // Returns < 0 on an error.
   int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    if (fb == NULL) {
+      EXPECT_TRUE(fb != NULL);
+      return -1;
+    }
     ExternalFrameBuffer *const ext_fb =
         reinterpret_cast<ExternalFrameBuffer*>(fb->priv);
-    EXPECT_TRUE(ext_fb != NULL);
+    if (ext_fb == NULL) {
+      EXPECT_TRUE(ext_fb != NULL);
+      return -1;
+    }
     EXPECT_EQ(1, ext_fb->in_use);
     ext_fb->in_use = 0;
     return 0;
@@ -285,7 +292,7 @@
     video_->Init();
     video_->Begin();
 
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc
index 7c48260..3f6b738 100644
--- a/libvpx/test/fdct4x4_test.cc
+++ b/libvpx/test/fdct4x4_test.cc
@@ -13,53 +13,90 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
-
-extern "C" {
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
 const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
 
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_fdct4x4_c(in, out, stride);
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
+  vpx_fdct4x4_c(in, out, stride);
 }
 
-void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fht4x4_c(in, out, stride, tx_type);
 }
 
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
   vp9_fwht4x4_c(in, out, stride);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
+}
+
+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
+}
+
+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE2
+void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
+}
+
+void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 class Trans4x4TestBase {
  public:
   virtual ~Trans4x4TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
 
-  virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunAccuracyCheck(int limit) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -67,24 +104,49 @@
     int64_t total_error = 0;
     const int count_test_block = 10000;
     for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
 
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                           test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        ASSERT_EQ(VPX_BITS_8, bit_depth_);
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -104,14 +166,14 @@
   void RunCoeffCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 5000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -125,23 +187,21 @@
   void RunMemCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 5000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -151,8 +211,8 @@
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
         EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
       }
     }
   }
@@ -160,25 +220,49 @@
   void RunInvAccuracyCheck(int limit) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       fwd_txfm_ref(in, coeff, pitch_, tx_type_);
 
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         EXPECT_GE(static_cast<uint32_t>(limit), error)
             << "Error: 4x4 IDCT has error " << error
@@ -190,6 +274,8 @@
   int pitch_;
   int tx_type_;
   FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
 };
 
 class Trans4x4DCT
@@ -204,14 +290,16 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fdct4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -247,15 +335,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
 
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -291,14 +381,16 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fwht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -323,57 +415,140 @@
 }
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_neon, 0)));
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
-#endif
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_MMX
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     MMX, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct4x4_sse2,
-                   &vp9_idct4x4_16_add_sse2, 0)));
+        make_tuple(&vpx_fdct4x4_sse2,
+                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
-#endif
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct4x4_sse2,      &vpx_idct4x4_16_add_c, 0,
+                   VPX_BITS_8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
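Note on the parameterization above: every tuple now carries a vpx_bit_depth_t, and SetUp() derives mask_ as (1 << bit_depth_) - 1 so random inputs stay inside the legal pixel range for that depth. A minimal standalone sketch of the derivation (the enum below is only a stand-in for vpx_bit_depth_t; the real VPX_BITS_8/10/12 enumerators carry the same numeric values):

    #include <cstdio>

    // Stand-in for vpx_bit_depth_t, used only for this sketch.
    enum BitDepth { kBits8 = 8, kBits10 = 10, kBits12 = 12 };

    int main() {
      const BitDepth depths[] = { kBits8, kBits10, kBits12 };
      for (int i = 0; i < 3; ++i) {
        // Same derivation as the tests' SetUp(): mask_ = (1 << bit_depth_) - 1.
        const int mask = (1 << depths[i]) - 1;
        // Inputs are drawn as rnd.Rand16() & mask_, so they stay in [0, mask]:
        // 255, 1023 or 4095 depending on the configured depth.
        std::printf("bit depth %d -> mask %d\n",
                    static_cast<int>(depths[i]), mask);
      }
      return 0;
    }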
diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc
index 567e5f6..c0deaf4 100644
--- a/libvpx/test/fdct8x8_test.cc
+++ b/libvpx/test/fdct8x8_test.cc
@@ -13,52 +13,139 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
-
-extern "C" {
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+
+const int kNumCoeffs = 64;
+const double kPi = 3.141592653589793238462643383279502884;
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
 
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
-  vp9_fdct8x8_c(in, out, stride);
+void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 8; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 8; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
 }
 
-void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
+                          double output[kNumCoeffs]) {
+  // First transform columns
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = input[j*8 + i];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 8; ++j)
+      output[j * 8 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = output[j + i*8];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    // Scale by 2 to match the integer forward transform's scaling.
+    for (int j = 0; j < 8; ++j)
+      output[j + i * 8] = temp_out[j] * 2;
+  }
+}
+
+
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  vpx_fdct8x8_c(in, out, stride);
+}
+
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fht8x8_c(in, out, stride, tx_type);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+}
+
+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+}
+
+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+}
+
+void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_c(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE2
+void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
+}
+
+void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 class FwdTrans8x8TestBase {
  public:
   virtual ~FwdTrans8x8TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunSignBiasCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_output_block[64]);
     int count_sign_block[64][2];
     const int count_test_block = 100000;
 
@@ -67,7 +154,8 @@
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < 64; ++j)
-        test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+        test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
+                              ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
@@ -81,8 +169,8 @@
 
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = 1125;
-      EXPECT_LT(diff, max_diff)
+      const int max_diff = kSignBiasMaxDiff255;
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-255, 255] at index " << j
@@ -94,9 +182,10 @@
     memset(count_sign_block, 0, sizeof(count_sign_block));
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-15, 15].
+      // Initialize a test block with input range [-mask_ / 16, mask_ / 16].
       for (int j = 0; j < 64; ++j)
-        test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+        test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
+                              ((rnd.Rand16() & mask_) >> 4);
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
@@ -110,9 +199,9 @@
 
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = 10000;
-      EXPECT_LT(diff, max_diff)
-          << "Error: 4x4 FDCT/FHT has a sign bias > "
+      const int max_diff = kSignBiasMaxDiff15;
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
+          << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-15, 15] at index " << j
           << " count0: " << count_sign_block[j][0]
@@ -126,17 +215,29 @@
     int max_error = 0;
     int total_error = 0;
     const int count_test_block = 100000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < 64; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(
@@ -152,11 +253,23 @@
             test_temp_block[j] *= 4;
           }
       }
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const int diff = dst[j] - src[j];
+#endif
         const int error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -164,11 +277,11 @@
       }
     }
 
-    EXPECT_GE(1, max_error)
+    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
       << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
       << " roundtrip error > 1";
 
-    EXPECT_GE(count_test_block/5, total_error)
+    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
       << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
       << "error > 1/5 per block";
   }
@@ -179,38 +292,69 @@
     int total_error = 0;
     int total_coeff_error = 0;
     const int count_test_block = 100000;
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < 64; ++j) {
-        if (i == 0) {
-          src[j] = 255;
-          dst[j] = 0;
-        } else if (i == 1) {
-          src[j] = 0;
-          dst[j] = 255;
+        if (bit_depth_ == VPX_BITS_8) {
+          if (i == 0) {
+            src[j] = 255;
+            dst[j] = 0;
+          } else if (i == 1) {
+            src[j] = 0;
+            dst[j] = 255;
+          } else {
+            src[j] = rnd.Rand8() % 2 ? 255 : 0;
+            dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          }
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
-          src[j] = rnd.Rand8() % 2 ? 255 : 0;
-          dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          if (i == 0) {
+            src16[j] = mask_;
+            dst16[j] = 0;
+          } else if (i == 1) {
+            src16[j] = 0;
+            dst16[j] = mask_;
+          } else {
+            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          }
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
         }
-
-        test_input_block[j] = src[j] - dst[j];
       }
 
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       ASM_REGISTER_STATE_CHECK(
           fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const int diff = dst[j] - src[j];
+#endif
         const int error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -220,11 +364,11 @@
         total_coeff_error += abs(coeff_diff);
       }
 
-      EXPECT_GE(1, max_error)
+      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
           << "an individual roundtrip error > 1";
 
-      EXPECT_GE(count_test_block/5, total_error)
+      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
           << " roundtrip error > 1/5 per block";
 
@@ -234,9 +378,154 @@
     }
   }
 
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8() % 2 ? 255 : 0;
+          dst[j] = src[j] > 0 ? 0 : 255;
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          dst16[j] = src16[j] > 0 ? 0 : mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff_r[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
+
+      RunFwdTxfm(in, coeff, pitch_);
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = coeff[j] - coeff_r[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 DCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 12;
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif
+    const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          // Random values less than the threshold, either positive or negative
+          coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
   int pitch_;
   int tx_type_;
   FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
 };
 
 class FwdTrans8x8DCT
@@ -251,15 +540,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 8;
     fwd_txfm_ref = fdct8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -279,6 +570,14 @@
   RunExtremalCheck();
 }
 
+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+
+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
 class FwdTrans8x8HT
     : public FwdTrans8x8TestBase,
       public ::testing::TestWithParam<Ht8x8Param> {
@@ -291,15 +590,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 8;
     fwd_txfm_ref = fht8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -319,52 +620,170 @@
   RunExtremalCheck();
 }
 
+class InvTrans8x8DCT
+    : public FwdTrans8x8TestBase,
+      public ::testing::TestWithParam<Idct8x8Param> {
+ public:
+  virtual ~InvTrans8x8DCT() {}
+
+  virtual void SetUp() {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    thresh_ = GET_PARAM(2);
+    pitch_ = 8;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
+
+  IdctFunc ref_txfm_;
+  IdctFunc inv_txfm_;
+  int thresh_;
+};
+
+TEST_P(InvTrans8x8DCT, CompareReference) {
+  CompareInvReference(ref_txfm_, thresh_);
+}
+
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
+        make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
-#endif
+        make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0,
+                   VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSE2
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
+        make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 0,
+                   VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
-#endif
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_highbd_fdct8x8_c,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_c,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+
+// Optimizations take effect at a threshold of 6201, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&idct8x8_10_add_10_c,
+                   &idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10,
+                   &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10_add_12_c,
+                   &idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
+        make_tuple(&idct8x8_12,
+                   &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
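Note on the threshold test above: CompareInvReference() seeds the first eob coefficients with values below the threshold, alternating sign with the iteration index, so an instantiation threshold of 6225 lands on both sides of the 6201 cutoff mentioned in the comment. A rough standalone sketch of that seeding (std::rand stands in for the ACMRandom draw the test actually uses):

    #include <cstdio>
    #include <cstdlib>

    int main() {
      const int thresh = 6225;  // threshold used by the SSE2 instantiation above
      for (int i = 0; i < 4; ++i) {
        // Mirrors coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2)):
        // magnitude below thresh, sign flipping on odd iterations.
        const int value = (std::rand() % thresh) * (1 - 2 * (i % 2));
        std::printf("iteration %d -> coefficient %d\n", i, value);
      }
      return 0;
    }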
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+    !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+        make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_ssse3, 0,
+                   VPX_BITS_8)));
 #endif
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
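The round-trip checks in this file scale their error bounds with bit depth: the per-sample limit is 1 << 2 * (bit_depth_ - 8), because pixel magnitudes grow by a factor of 2^(bit_depth - 8) and the metric squares the difference. A standalone sketch of the resulting bounds:

    #include <cstdio>

    int main() {
      // Per-sample squared-error bound used by the 8x8 round-trip checks:
      // 1 at 8 bits, 16 at 10 bits, 256 at 12 bits.
      for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
        const int individual_bound = 1 << (2 * (bit_depth - 8));
        std::printf("bit depth %d -> round-trip bound %d\n",
                    bit_depth, individual_bound);
      }
      return 0;
    }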
diff --git a/libvpx/test/frame_size_tests.cc b/libvpx/test/frame_size_tests.cc
index 2400c20..95cc66a 100644
--- a/libvpx/test/frame_size_tests.cc
+++ b/libvpx/test/frame_size_tests.cc
@@ -27,7 +27,7 @@
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource &video,
+                                  const libvpx_test::VideoSource& /*video*/,
                                   libvpx_test::Decoder *decoder) {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
@@ -72,10 +72,25 @@
   // one for each lag in frames (for 2 pass), and then one for each possible
   // reference buffer (8) - we can end up with up to 30 buffers of roughly this
   // size or almost 1 gig of memory.
+  // In total the allocations will exceed 2 GiB, which may cause a failure
+  // with mingw + wine; use a smaller size in that case.
+#if defined(_WIN32) && !defined(_WIN64)
+  video.SetSize(4096, 3072);
+#else
   video.SetSize(4096, 4096);
+#endif
   video.set_limit(2);
   expected_res_ = VPX_CODEC_OK;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #endif
 }
+
+TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) {
+  ::libvpx_test::RandomVideoSource video;
+
+  video.SetSize(1, 1);
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
 }  // namespace
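The memory figures in the comment above follow from the I420 layout, one full-resolution luma plane plus two quarter-resolution chroma planes, i.e. width * height * 3 / 2 bytes per frame. A standalone sketch of the arithmetic (the buffer count is the rough figure quoted in the comment, not an exact accounting):

    #include <cstdio>

    int main() {
      const long long w = 4096, h = 4096;
      const long long bytes_per_frame = w * h * 3 / 2;  // 24 MiB at 4096x4096
      const long long approx_buffers = 30;  // rough count from the comment
      std::printf("per frame: %lld MiB, ~%lld buffers: %lld MiB total\n",
                  bytes_per_frame >> 20, approx_buffers,
                  (bytes_per_frame * approx_buffers) >> 20);
      return 0;
    }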
diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h
index c3315f9..0a18480 100644
--- a/libvpx/test/i420_video_source.h
+++ b/libvpx/test/i420_video_source.h
@@ -13,104 +13,22 @@
 #include <cstdlib>
 #include <string>
 
-#include "test/video_source.h"
+#include "test/yuv_video_source.h"
 
 namespace libvpx_test {
 
 // This class extends VideoSource to allow parsing of raw yv12
 // so that we can do actual file encodes.
-class I420VideoSource : public VideoSource {
+class I420VideoSource : public YUVVideoSource {
  public:
   I420VideoSource(const std::string &file_name,
                   unsigned int width, unsigned int height,
                   int rate_numerator, int rate_denominator,
                   unsigned int start, int limit)
-      : file_name_(file_name),
-        input_file_(NULL),
-        img_(NULL),
-        start_(start),
-        limit_(limit),
-        frame_(0),
-        width_(0),
-        height_(0),
-        framerate_numerator_(rate_numerator),
-        framerate_denominator_(rate_denominator) {
-    // This initializes raw_sz_, width_, height_ and allocates an img.
-    SetSize(width, height);
-  }
-
-  virtual ~I420VideoSource() {
-    vpx_img_free(img_);
-    if (input_file_)
-      fclose(input_file_);
-  }
-
-  virtual void Begin() {
-    if (input_file_)
-      fclose(input_file_);
-    input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-        << file_name_;
-    if (start_) {
-      fseek(input_file_, static_cast<unsigned>(raw_sz_) * start_, SEEK_SET);
-    }
-
-    frame_ = start_;
-    FillFrame();
-  }
-
-  virtual void Next() {
-    ++frame_;
-    FillFrame();
-  }
-
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
-
-  // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
-
-  virtual unsigned long duration() const { return 1; }
-
-  virtual vpx_rational_t timebase() const {
-    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
-    return t;
-  }
-
-  virtual unsigned int frame() const { return frame_; }
-
-  virtual unsigned int limit() const { return limit_; }
-
-  void SetSize(unsigned int width, unsigned int height) {
-    if (width != width_ || height != height_) {
-      vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
-      width_ = width;
-      height_ = height;
-      raw_sz_ = width * height * 3 / 2;
-    }
-  }
-
-  virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
-    // Read a frame from input_file.
-    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
-      limit_ = frame_;
-    }
-  }
-
- protected:
-  std::string file_name_;
-  FILE *input_file_;
-  vpx_image_t *img_;
-  size_t raw_sz_;
-  unsigned int start_;
-  unsigned int limit_;
-  unsigned int frame_;
-  unsigned int width_;
-  unsigned int height_;
-  int framerate_numerator_;
-  int framerate_denominator_;
+      : YUVVideoSource(file_name, VPX_IMG_FMT_I420,
+                       width, height,
+                       rate_numerator, rate_denominator,
+                       start, limit) {}
 };
 
 }  // namespace libvpx_test
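With I420VideoSource reduced to a thin constructor over YUVVideoSource, a typical use keeps the old signature. A minimal sketch, assuming the libvpx test harness and its test-data directory are set up; the .yuv file name is only an example:

    #include "test/i420_video_source.h"

    // Not a real test, just the constructor arguments in order: file name,
    // width, height, timebase numerator and denominator, first frame, limit.
    void ExampleI420Source() {
      ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv",
                                           352, 288, 30, 1, 0, 5);
      video.Begin();      // opens the file and reads the first frame
      (void)video.img();  // raw VPX_IMG_FMT_I420 frame; NULL once past limit
    }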
diff --git a/libvpx/test/idct8x8_test.cc b/libvpx/test/idct8x8_test.cc
index 5f4c33a..987ba75 100644
--- a/libvpx/test/idct8x8_test.cc
+++ b/libvpx/test/idct8x8_test.cc
@@ -14,8 +14,7 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "./vp9_rtcd.h"
-
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
@@ -109,7 +108,8 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 10000;
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t input[64], coeff[64];
+    int16_t input[64];
+    tran_low_t coeff[64];
     double output_r[64];
     uint8_t dst[64], src[64];
 
@@ -124,7 +124,7 @@
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
       coeff[j] = round(output_r[j]);
-    vp9_idct8x8_64_add_c(coeff, dst, 8);
+    vpx_idct8x8_64_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;
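The coefficient buffers here switch from int16_t to tran_low_t, which is what lets the same tests serve both build modes: tran_low_t widens when high bit depth is enabled. A sketch of the conditional typedef as libvpx defines it; the header that hosts the real definition is treated as an assumption here:

    #include <stdint.h>

    /* Sketch only; the location of the real typedef is an assumption. */
    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;  /* extra headroom for 10/12-bit coefficients */
    #else
    typedef int16_t tran_low_t;
    #endif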
diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc
index 2ff9e64..39db3e4 100644
--- a/libvpx/test/idct_test.cc
+++ b/libvpx/test/idct_test.cc
@@ -10,10 +10,11 @@
 
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
 #include "vpx/vpx_integer.h"
 
 typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr,
@@ -113,4 +114,8 @@
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                         ::testing::Values(vp8_short_idct4x4llm_mmx));
 #endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_msa));
+#endif
 }
diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc
index ead4760..65a0697 100644
--- a/libvpx/test/intrapred_test.cc
+++ b/libvpx/test/intrapred_test.cc
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <string.h>
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -294,6 +294,16 @@
                         ::testing::Values(
                             vp8_build_intra_predictors_mby_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_msa));
+#endif
 
 typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
                                 uint8_t *uabove_row,
@@ -382,5 +392,15 @@
                         ::testing::Values(
                             vp8_build_intra_predictors_mbuv_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_msa));
+#endif
 
 }  // namespace
diff --git a/libvpx/test/invalid_file_test.cc b/libvpx/test/invalid_file_test.cc
index 0a1c17c..1b5ef5c 100644
--- a/libvpx/test/invalid_file_test.cc
+++ b/libvpx/test/invalid_file_test.cc
@@ -73,7 +73,7 @@
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
     libvpx_test::CompressedVideoSource *video = NULL;
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.threads = input.threads;
     const std::string filename = input.filename;
 
@@ -112,10 +112,19 @@
 
 const DecodeParam kVP9InvalidFileTests[] = {
   {1, "invalid-vp90-02-v2.webm"},
+#if CONFIG_VP9_HIGHBITDEPTH
   {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
-  {1, "invalid-vp90-03-v2.webm"},
+#endif
+  {1, "invalid-vp90-03-v3.webm"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
+  {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf"},
 };
 
 VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -126,9 +135,9 @@
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const decoder,
-                                libvpx_test::CompressedVideoSource *video,
-                                const vpx_codec_err_t res_peek) {}
+  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                                libvpx_test::CompressedVideoSource* /*video*/,
+                                const vpx_codec_err_t /*res_peek*/) {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
@@ -144,6 +153,11 @@
 
 const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
   {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+  {4, "invalid-"
+      "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+  {4, "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf"},
+  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
 };
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/libvpx/test/lpf_8_test.cc b/libvpx/test/lpf_8_test.cc
new file mode 100644
index 0000000..966e109
--- /dev/null
+++ b/libvpx/test/lpf_8_test.cc
@@ -0,0 +1,716 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8  Coeffs preceding filtered section
+//                                         16 Coeffs within filtered section
+//                                         8  Coeffs following filtered section
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count, int bd);
+typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+#else
+typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count);
+typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count, int bd) {
+  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count, int bd) {
+  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count, int bd) {
+  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+}
+#else
+void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON_ASM
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON_ASM
+
+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int count) {
+  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
+class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
+ public:
+  virtual ~Loop8Test6Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    count_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int count_;
+  int mask_;
+  loop_op_t loopfilter_op_;
+  loop_op_t ref_loopfilter_op_;
+};
+
+class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
+ public:
+  virtual ~Loop8Test9Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  dual_loop_op_t loopfilter_op_;
+  dual_loop_op_t ref_loopfilter_op_;
+};
+
+TEST_P(Loop8Test6Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs/32;
+
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat the previous value X times in a row.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+#else
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test6Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+
+  // NOTE: The code in vp9_loopfilter.c:update_sharpness computes mblim as a
+  // function of sharpness_lvl and the loopfilter lvl as:
+  // block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
+  // ...
+  // memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+  //        SIMD_WIDTH);
+  // This means that the largest value for mblim will occur when sharpness_lvl
+  // is equal to 0, and lvl is equal to its greatest value (MAX_LOOP_FILTER).
+  // In this case block_inside_limit will be equal to MAX_LOOP_FILTER and
+  // therefore mblim will be equal to (2 * (lvl + 2) + block_inside_limit) =
+  // 2 * (MAX_LOOP_FILTER + 2) + MAX_LOOP_FILTER = 3 * MAX_LOOP_FILTER + 4
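+  // With MAX_LOOP_FILTER equal to 63 this bound works out to
+  // 3 * 63 + 4 = 193, which is why blimit below is drawn from
+  // rnd(3 * MAX_LOOP_FILTER + 4).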
+
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+#else
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8,  uint8_t,  s[kNumCoeffs]);
+  DECLARE_ALIGNED(8,  uint8_t,  ref_s[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;
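+    // Step the generated runs by the smaller of the two limits so that both
+    // filtered halves see transitions within their respective limits.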
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat the previous value X times in a row.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, ValueCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8,  uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8,  uint8_t, ref_s[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int32_t bd = bit_depth_;
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
+                       thresh0, blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2"
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 12, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 10, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 12, 1)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif
+
+#if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
+                   2)));
+#endif
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_sse2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_sse2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_sse2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test6Param,
+    ::testing::Values(
+#if HAVE_NEON_ASM
+// Using #if inside the macro is unsupported on MSVS but the tests are not
+// currently built for MSVS with ARM and NEON.
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_vertical_16_neon,
+                   &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_neon,
+                   &wrapper_vertical_16_dual_c, 8, 1),
+#endif  // HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_8_neon,
+                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8, 1)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(
+#if HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                   &vpx_lpf_vertical_8_dual_c, 8),
+#endif  // HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                   &vpx_lpf_vertical_4_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_msa,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_msa,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_msa,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_msa,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
+}  // namespace
diff --git a/libvpx/test/md5_helper.h b/libvpx/test/md5_helper.h
index dc95582..742cf0b7 100644
--- a/libvpx/test/md5_helper.h
+++ b/libvpx/test/md5_helper.h
@@ -28,7 +28,8 @@
       // plane, we never want to round down and thus skip a pixel so if
       // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
       // This works only for chroma_shift of 0 and 1.
-      const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
       const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                     img->y_chroma_shift : img->d_h;
       const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
@@ -41,6 +42,10 @@
     }
   }
 
+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
   const char *Get(void) {
     static const char hex[16] = {
       '0', '1', '2', '3', '4', '5', '6', '7',
diff --git a/libvpx/test/partial_idct_test.cc b/libvpx/test/partial_idct_test.cc
index 15f4e6c..6c82412 100644
--- a/libvpx/test/partial_idct_test.cc
+++ b/libvpx/test/partial_idct_test.cc
@@ -13,12 +13,13 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_integer.h"
@@ -26,8 +27,8 @@
 using libvpx_test::ACMRandom;
 
 namespace {
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tr1::tuple<FwdTxfmFunc,
                         InvTxfmFunc,
                         InvTxfmFunc,
@@ -74,16 +75,16 @@
       FAIL() << "Wrong Size!";
       break;
   }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
 
   const int count_test_block = 1000;
   const int block_size = size * size;
 
-  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
 
   int max_error = 0;
   for (int i = 0; i < count_test_block; ++i) {
@@ -153,10 +154,10 @@
       FAIL() << "Wrong Size!";
       break;
   }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
   const int count_test_block = 1000;
   const int max_coeff = 32766 / 4;
   const int block_size = size * size;
@@ -201,115 +202,142 @@
 INSTANTIATE_TEST_CASE_P(
     C, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_c,
-                   &vp9_idct32x32_34_add_c,
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_c,
                    TX_32X32, 34),
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_c,
-                   &vp9_idct32x32_1_add_c,
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_c,
                    TX_32X32, 1),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_10_add_c,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_c,
                    TX_16X16, 10),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_1_add_c,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_c,
                    TX_16X16, 1),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_12_add_c,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_c,
                    TX_8X8, 12),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_1_add_c,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_c,
                    TX_8X8, 1),
-        make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_c,
-                   &vp9_idct4x4_1_add_c,
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_c,
                    TX_4X4, 1)));
-#if HAVE_NEON_ASM
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_c,
-                   &vp9_idct32x32_1_add_neon,
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_neon,
                    TX_32X32, 1),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_10_add_neon,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_neon,
                    TX_16X16, 10),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_1_add_neon,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_neon,
                    TX_16X16, 1),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_12_add_neon,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_neon,
                    TX_8X8, 12),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_1_add_neon,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_neon,
                    TX_8X8, 1),
-        make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_c,
-                   &vp9_idct4x4_1_add_neon,
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_neon,
                    TX_4X4, 1)));
-#endif
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_c,
-                   &vp9_idct32x32_34_add_sse2,
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_sse2,
                    TX_32X32, 34),
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_c,
-                   &vp9_idct32x32_1_add_sse2,
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_sse2,
                    TX_32X32, 1),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_10_add_sse2,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_sse2,
                    TX_16X16, 10),
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_1_add_sse2,
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_sse2,
                    TX_16X16, 1),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_12_add_sse2,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_sse2,
                    TX_8X8, 12),
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_1_add_sse2,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_sse2,
                    TX_8X8, 1),
-        make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_c,
-                   &vp9_idct4x4_1_add_sse2,
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_sse2,
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+    !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3_64, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c,
-                   &vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_12_add_ssse3,
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_ssse3,
                    TX_8X8, 12)));
 #endif
 
-#if HAVE_SSSE3
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, PartialIDctTest,
+    MSA, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_c,
-                   &vp9_idct16x16_10_add_ssse3,
-                   TX_16X16, 10)));
-#endif
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_msa,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_msa,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_msa,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_msa,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_msa,
+                   TX_8X8, 10),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_msa,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_msa,
+                   TX_4X4, 1)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 }  // namespace
diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc
index a9b16e0..e4688dd 100644
--- a/libvpx/test/pp_filter_test.cc
+++ b/libvpx/test/pp_filter_test.cc
@@ -63,12 +63,12 @@
   uint8_t *const dst_image_ptr = dst_image + 8;
   uint8_t *const flimits =
       reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
-  (void)vpx_memset(flimits, 255, block_width);
+  (void)memset(flimits, 255, block_width);
 
   // Initialize pixels in the input:
   //   block pixels to value 1,
   //   border pixels to value 10.
-  (void)vpx_memset(src_image, 10, input_size);
+  (void)memset(src_image, 10, input_size);
   uint8_t *pixel_ptr = src_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
@@ -78,7 +78,7 @@
   }
 
   // Initialize pixels in the output to 99.
-  (void)vpx_memset(dst_image, 99, output_size);
+  (void)memset(dst_image, 99, output_size);
 
   ASM_REGISTER_STATE_CHECK(
       GetParam()(src_image_ptr, dst_image_ptr, input_stride,
@@ -110,4 +110,9 @@
     ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
 #endif
 
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
+#endif
+
 }  // namespace
diff --git a/libvpx/test/quantize_test.cc b/libvpx/test/quantize_test.cc
new file mode 100644
index 0000000..69da899
--- /dev/null
+++ b/libvpx/test/quantize_test.cc
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/onyx.h"
+#include "vp8/encoder/block.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+const int kNumBlocks = 25;
+const int kNumBlockEntries = 16;
+
+typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d);
+
+typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;
+
+using libvpx_test::ACMRandom;
+using std::tr1::make_tuple;
+
+// Create and populate a VP8_COMP instance which has a complete set of
+// quantization inputs as well as a second MACROBLOCKD for output.
+class QuantizeTestBase {
+ public:
+  virtual ~QuantizeTestBase() {
+    vp8_remove_compressor(&vp8_comp_);
+    vp8_comp_ = NULL;
+    vpx_free(macroblockd_dst_);
+    macroblockd_dst_ = NULL;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void SetupCompressor() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    // The full configuration is necessary to generate the quantization tables.
+    VP8_CONFIG vp8_config;
+    memset(&vp8_config, 0, sizeof(vp8_config));
+
+    vp8_comp_ = vp8_create_compressor(&vp8_config);
+
+    // Set the tables based on a quantizer of 0.
+    vp8_set_quantizer(vp8_comp_, 0);
+
+    // Set up all the block/blockd pointers for the mb in vp8_comp_.
+    vp8cx_frame_init_quantizer(vp8_comp_);
+
+    // Copy macroblockd from the reference to get pre-set-up dequant values.
+    macroblockd_dst_ = reinterpret_cast<MACROBLOCKD *>(
+        vpx_memalign(32, sizeof(*macroblockd_dst_)));
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    // Fix block pointers - currently they point to the blocks in the reference
+    // structure.
+    vp8_setup_block_dptrs(macroblockd_dst_);
+  }
+
+  void UpdateQuantizer(int q) {
+    vp8_set_quantizer(vp8_comp_, q);
+
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    vp8_setup_block_dptrs(macroblockd_dst_);
+  }
+
+  void FillCoeffConstant(int16_t c) {
+    for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+      vp8_comp_->mb.coeff[i] = c;
+    }
+  }
+
+  void FillCoeffRandom() {
+    for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+      vp8_comp_->mb.coeff[i] = rnd_.Rand8();
+    }
+  }
+
+  void CheckOutput() {
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.qcoeff, macroblockd_dst_->qcoeff,
+                        sizeof(*macroblockd_dst_->qcoeff) * kNumBlocks *
+                            kNumBlockEntries))
+        << "qcoeff mismatch";
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.dqcoeff, macroblockd_dst_->dqcoeff,
+                        sizeof(*macroblockd_dst_->dqcoeff) * kNumBlocks *
+                            kNumBlockEntries))
+        << "dqcoeff mismatch";
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.eobs, macroblockd_dst_->eobs,
+                        sizeof(*macroblockd_dst_->eobs) * kNumBlocks))
+        << "eobs mismatch";
+  }
+
+  VP8_COMP *vp8_comp_;
+  MACROBLOCKD *macroblockd_dst_;
+
+ private:
+  ACMRandom rnd_;
+};
+
+class QuantizeTest : public QuantizeTestBase,
+                     public ::testing::TestWithParam<VP8QuantizeParam> {
+ protected:
+  virtual void SetUp() {
+    SetupCompressor();
+    asm_quant_ = GET_PARAM(0);
+    c_quant_ = GET_PARAM(1);
+  }
+
+  void RunComparison() {
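+    // Quantize every block with the C reference (into vp8_comp_'s own
+    // macroblockd) and with the optimized version (into macroblockd_dst_),
+    // then compare qcoeff, dqcoeff and eobs.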
+    for (int i = 0; i < kNumBlocks; ++i) {
+      ASM_REGISTER_STATE_CHECK(
+          c_quant_(&vp8_comp_->mb.block[i], &vp8_comp_->mb.e_mbd.block[i]));
+      ASM_REGISTER_STATE_CHECK(
+          asm_quant_(&vp8_comp_->mb.block[i], &macroblockd_dst_->block[i]));
+    }
+
+    CheckOutput();
+  }
+
+ private:
+  VP8Quantize asm_quant_;
+  VP8Quantize c_quant_;
+};
+
+TEST_P(QuantizeTest, TestZeroInput) {
+  FillCoeffConstant(0);
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestLargeNegativeInput) {
+  FillCoeffConstant(0);
+  // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues
+  // like BUG=883 where the constant being compared was incorrectly initialized.
+  vp8_comp_->mb.coeff[0] = -8191;
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestRandomInput) {
+  FillCoeffRandom();
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestMultipleQ) {
+  for (int q = 0; q < QINDEX_RANGE; ++q) {
+    UpdateQuantizer(q);
+    FillCoeffRandom();
+    RunComparison();
+  }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_sse2, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest,
+                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3,
+                                                     &vp8_fast_quantize_b_c)));
+#endif  // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest,
+                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,
+                                                     &vp8_fast_quantize_b_c)));
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h
index 8d4beea..8e72f91 100644
--- a/libvpx/test/register_state_check.h
+++ b/libvpx/test/register_state_check.h
@@ -96,7 +96,7 @@
 
 extern "C" {
 // Save the d8-d15 registers into store.
-void vp9_push_neon(int64_t *store);
+void vpx_push_neon(int64_t *store);
 }
 
 namespace libvpx_test {
@@ -111,7 +111,7 @@
 
  private:
   static bool StoreRegisters(int64_t store[8]) {
-    vp9_push_neon(store);
+    vpx_push_neon(store);
     return true;
   }
 
@@ -119,7 +119,7 @@
   bool Check() const {
     if (!initialized_) return false;
     int64_t post_store[8];
-    vp9_push_neon(post_store);
+    vpx_push_neon(post_store);
     for (int i = 0; i < 8; ++i) {
       EXPECT_EQ(pre_store_[i], post_store[i]) << "d"
           << i + 8 << " has been modified";
diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc
index 8d08f1e..f1134aa 100644
--- a/libvpx/test/resize_test.cc
+++ b/libvpx/test/resize_test.cc
@@ -144,6 +144,7 @@
 
 TEST_P(ResizeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
+  cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
@@ -153,9 +154,9 @@
     const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
 
     EXPECT_EQ(expected_w, info->w)
-        << "Frame " << frame << "had unexpected width";
+        << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
-        << "Frame " << frame << "had unexpected height";
+        << "Frame " << frame << " had unexpected height";
   }
 }
 
@@ -211,8 +212,8 @@
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
 #if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -222,8 +223,8 @@
     // Write frame header and data.
     write_ivf_frame_header(pkt, outfile_);
     (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
   }
+#endif
 
   double frame0_psnr_;
 #if WRITE_COMPRESSED_STREAM
@@ -260,7 +261,116 @@
   }
 }
 
+vpx_img_fmt_t CspForFrameNumber(int frame) {
+  if (frame < 10)
+    return VPX_IMG_FMT_I420;
+  if (frame < 20)
+    return VPX_IMG_FMT_I444;
+  return VPX_IMG_FMT_I420;
+}
+
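+// Encodes a clip whose color space switches between I420 and I444 every ten
+// frames (see CspForFrameNumber above), forcing a profile change mid-stream.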
+class ResizeCspTest : public ResizeTest {
+ protected:
+#if WRITE_COMPRESSED_STREAM
+  ResizeCspTest()
+      : ResizeTest(),
+        frame0_psnr_(0.0),
+        outfile_(NULL),
+        out_frames_(0) {}
+#else
+  ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+  virtual ~ResizeCspTest() {}
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb");
+#endif
+  }
+
+  virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+#endif
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 &&
+        cfg_.g_profile != 1) {
+      cfg_.g_profile = 1;
+      encoder->Config(&cfg_);
+    }
+    if (CspForFrameNumber(video->frame()) == VPX_IMG_FMT_I420 &&
+        cfg_.g_profile != 0) {
+      cfg_.g_profile = 0;
+      encoder->Config(&cfg_);
+    }
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (!frame0_psnr_)
+      frame0_psnr_ = pkt->data.psnr.psnr[0];
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+  }
+
+#if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+  }
+#endif
+
+  double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
+};
+
+class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingCspVideoSource() {
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 30;
+  }
+
+  virtual ~ResizingCspVideoSource() {}
+
+ protected:
+  virtual void Next() {
+    ++frame_;
+    SetImageFormat(CspForFrameNumber(frame_));
+    FillFrame();
+  }
+};
+
+TEST_P(ResizeCspTest, TestResizeCspWorks) {
+  ResizingCspVideoSource video;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+  cfg_.g_lag_in_frames = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
 VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ResizeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
 VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
                           ::testing::Values(::libvpx_test::kOnePassBest));
+VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
 }  // namespace
diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc
index e63770b..e6a5e0b 100644
--- a/libvpx/test/sad_test.cc
+++ b/libvpx/test/sad_test.cc
@@ -13,64 +13,74 @@
 #include <limits.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#if CONFIG_VP8_ENCODER
-#include "./vp8_rtcd.h"
-#endif
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr,
+                                   int src_stride,
+                                   const uint8_t *ref_ptr,
+                                   int ref_stride);
+typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
-#if CONFIG_VP8_ENCODER
-typedef unsigned int (*SadMxNFunc)(const unsigned char *source_ptr,
-                                   int source_stride,
-                                   const unsigned char *reference_ptr,
-                                   int reference_stride,
-                                   unsigned int max_sad);
-typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam;
-#endif
-#if CONFIG_VP9_ENCODER
-typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr,
-                                      int source_stride,
-                                      const unsigned char *reference_ptr,
-                                      int reference_stride);
-typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param;
-#endif
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr,
+                                  int src_stride,
+                                  const uint8_t *ref_ptr,
+                                  int ref_stride,
+                                  const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
                              int src_stride,
-                             const unsigned char *const ref_ptr[],
+                             const uint8_t *const ref_ptr[],
                              int ref_stride,
-                             unsigned int *sad_array);
-typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param;
+                             uint32_t *sad_array);
+typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
 using libvpx_test::ACMRandom;
 
 namespace {
 class SADTestBase : public ::testing::Test {
  public:
-  SADTestBase(int width, int height) : width_(width), height_(height) {}
+  SADTestBase(int width, int height, int bit_depth) :
+      width_(width), height_(height), bd_(bit_depth) {}
 
   static void SetUpTestCase() {
-    source_data_ = reinterpret_cast<uint8_t*>(
+    source_data8_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
-    reference_data_ = reinterpret_cast<uint8_t*>(
+    reference_data8_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+    source_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
   }
 
   static void TearDownTestCase() {
-    vpx_free(source_data_);
-    source_data_ = NULL;
-    vpx_free(reference_data_);
-    reference_data_ = NULL;
+    vpx_free(source_data8_);
+    source_data8_ = NULL;
+    vpx_free(reference_data8_);
+    reference_data8_ = NULL;
+    vpx_free(second_pred8_);
+    second_pred8_ = NULL;
+    vpx_free(source_data16_);
+    source_data16_ = NULL;
+    vpx_free(reference_data16_);
+    reference_data16_ = NULL;
+    vpx_free(second_pred16_);
+    second_pred16_ = NULL;
   }
 
   virtual void TearDown() {
@@ -84,53 +94,146 @@
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
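+    // A bit depth of -1 selects the plain 8-bit path; otherwise the 16-bit
+    // buffers are wrapped with CONVERT_TO_BYTEPTR for high bit depth tests.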
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = VPX_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    mask_ = (1 << bit_depth_) - 1;
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  virtual uint8_t* GetReference(int block_idx) {
+  virtual uint8_t *GetReference(int block_idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (use_high_bit_depth_)
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return reference_data_ + block_idx * kDataBlockSize;
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
-  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
+  unsigned int ReferenceSAD(int block_idx) {
     unsigned int sad = 0;
-    const uint8_t* const reference = GetReference(block_idx);
-
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-        sad += abs(source_data_[h * source_stride_ + w]
-               - reference[h * reference_stride_ + w]);
-      }
-      if (sad > max_sad) {
-        break;
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
       }
     }
     return sad;
   }
 
-  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+  // Sum of Absolute Differences Average. Given two blocks and a second
+  // prediction, calculate the absolute difference between each source pixel
+  // and the average of the corresponding reference and predicted pixels;
+  // accumulate.
+  unsigned int ReferenceSADavg(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint8_t *const second_pred8 = second_pred_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-        data[h * stride + w] = fill_constant;
+        if (!use_high_bit_depth_) {
+          const int tmp = second_pred8[h * width_ + w] +
+              reference8[h * reference_stride_ + w];
+          const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source8[h * source_stride_ + w] - comp_pred);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          const int tmp = second_pred16[h * width_ + w] +
+              reference16[h * reference_stride_ + w];
+          const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source16[h * source_stride_ + w] - comp_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+    return sad;
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+    uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          data16[h * stride + w] = fill_constant;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
       }
     }
   }
 
   void FillRandom(uint8_t *data, int stride) {
+    uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-        data[h * stride + w] = rnd_.Rand8();
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          data16[h * stride + w] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
       }
     }
   }
 
-  int width_, height_;
-  static uint8_t* source_data_;
+  int width_, height_, mask_, bd_;
+  vpx_bit_depth_t bit_depth_;
+  static uint8_t *source_data_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
   int source_stride_;
-  static uint8_t* reference_data_;
+  bool use_high_bit_depth_;
+  static uint8_t *source_data8_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *source_data16_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
   int reference_stride_;
 
   ACMRandom rnd_;
@@ -140,15 +243,15 @@
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNx4Param> {
  public:
-  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   void SADs(unsigned int *results) {
-    const uint8_t* refs[] = {GetReference(0), GetReference(1),
-                             GetReference(2), GetReference(3)};
+    const uint8_t *references[] = {GetReference(0), GetReference(1),
+                                   GetReference(2), GetReference(3)};
 
     ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
-                                          refs, reference_stride_,
+                                          references, reference_stride_,
                                           results));
   }
 
@@ -157,56 +260,23 @@
 
     SADs(exp_sad);
     for (int block = 0; block < 4; ++block) {
-      reference_sad = ReferenceSAD(UINT_MAX, block);
+      reference_sad = ReferenceSAD(block);
 
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
 };
 
-#if CONFIG_VP8_ENCODER
 class SADTest
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNParam> {
  public:
-  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  unsigned int SAD(unsigned int max_sad, int block_idx) {
-    unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
-
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                                reference, reference_stride_,
-                                                max_sad));
-    return ret;
-  }
-
-  void CheckSAD(unsigned int max_sad) {
-    const unsigned int reference_sad = ReferenceSAD(max_sad, 0);
-    const unsigned int exp_sad = SAD(max_sad, 0);
-
-    if (reference_sad <= max_sad) {
-      ASSERT_EQ(exp_sad, reference_sad);
-    } else {
-      // Alternative implementations are not required to check max_sad
-      ASSERT_GE(exp_sad, reference_sad);
-    }
-  }
-};
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-class SADVP9Test
-    : public SADTestBase,
-      public ::testing::WithParamInterface<SadMxNVp9Param> {
- public:
-  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   unsigned int SAD(int block_idx) {
     unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
+    const uint8_t *const reference = GetReference(block_idx);
 
     ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
@@ -214,36 +284,66 @@
   }
 
   void CheckSAD() {
-    const unsigned int reference_sad = ReferenceSAD(UINT_MAX, 0);
+    const unsigned int reference_sad = ReferenceSAD(0);
     const unsigned int exp_sad = SAD(0);
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
 };
-#endif  // CONFIG_VP9_ENCODER
 
-uint8_t* SADTestBase::source_data_ = NULL;
-uint8_t* SADTestBase::reference_data_ = NULL;
+class SADavgTest
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNAvgParam> {
+ public:
+  SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
-#if CONFIG_VP8_ENCODER
+ protected:
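+  // Calls the function under test, which averages the reference block with
+  // second_pred_ before computing the SAD against the source block.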
+  unsigned int SAD_avg(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADavg(0);
+    const unsigned int exp_sad = SAD_avg(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
+
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+
 TEST_P(SADTest, MaxRef) {
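+  // mask_ holds the largest pixel value for the configured bit depth.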
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
-  CheckSAD(UINT_MAX);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
 }
 
 TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(UINT_MAX);
+  CheckSAD();
 }
 
 TEST_P(SADTest, ShortRef) {
-  int tmp_stride = reference_stride_;
+  const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
+  CheckSAD();
   reference_stride_ = tmp_stride;
 }
 
@@ -254,7 +354,7 @@
   reference_stride_ -= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
+  CheckSAD();
   reference_stride_ = tmp_stride;
 }
 
@@ -263,73 +363,66 @@
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
+  CheckSAD();
   source_stride_ = tmp_stride;
 }
 
-TEST_P(SADTest, MaxSAD) {
-  // Verify that, when max_sad is set, the implementation does not return a
-  // value lower than the reference.
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(128);
-}
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-TEST_P(SADVP9Test, MaxRef) {
+TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
-
-TEST_P(SADVP9Test, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+TEST_P(SADavgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
   CheckSAD();
 }
 
-TEST_P(SADVP9Test, ShortRef) {
+TEST_P(SADavgTest, ShortRef) {
   const int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
   CheckSAD();
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADVP9Test, UnalignedRef) {
+TEST_P(SADavgTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
   const int tmp_stride = reference_stride_;
   reference_stride_ -= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
   CheckSAD();
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADVP9Test, ShortSrc) {
+TEST_P(SADavgTest, ShortSrc) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
   FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
   CheckSAD();
   source_stride_ = tmp_stride;
 }
-#endif  // CONFIG_VP9_ENCODER
 
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(GetReference(0), reference_stride_, 255);
-  FillConstant(GetReference(1), reference_stride_, 255);
-  FillConstant(GetReference(2), reference_stride_, 255);
-  FillConstant(GetReference(3), reference_stride_, 255);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
   CheckSADs();
 }
 
 TEST_P(SADx4Test, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(GetReference(0), reference_stride_, 0);
   FillConstant(GetReference(1), reference_stride_, 0);
   FillConstant(GetReference(2), reference_stride_, 0);
@@ -375,277 +468,743 @@
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADx4Test, SrcAlignedByWidth) {
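+  // Advance the source pointer by one block width so that it is aligned to
+  // the block width rather than to the full buffer alignment.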
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
 using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_c = vp8_sad16x16_c;
-const SadMxNFunc sad_8x16_c = vp8_sad8x16_c;
-const SadMxNFunc sad_16x8_c = vp8_sad16x8_c;
-const SadMxNFunc sad_8x8_c = vp8_sad8x8_c;
-const SadMxNFunc sad_4x4_c = vp8_sad4x4_c;
+const SadMxNFunc sad64x64_c = vpx_sad64x64_c;
+const SadMxNFunc sad64x32_c = vpx_sad64x32_c;
+const SadMxNFunc sad32x64_c = vpx_sad32x64_c;
+const SadMxNFunc sad32x32_c = vpx_sad32x32_c;
+const SadMxNFunc sad32x16_c = vpx_sad32x16_c;
+const SadMxNFunc sad16x32_c = vpx_sad16x32_c;
+const SadMxNFunc sad16x16_c = vpx_sad16x16_c;
+const SadMxNFunc sad16x8_c = vpx_sad16x8_c;
+const SadMxNFunc sad8x16_c = vpx_sad8x16_c;
+const SadMxNFunc sad8x8_c = vpx_sad8x8_c;
+const SadMxNFunc sad8x4_c = vpx_sad8x4_c;
+const SadMxNFunc sad4x8_c = vpx_sad4x8_c;
+const SadMxNFunc sad4x4_c = vpx_sad4x4_c;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNFunc highbd_sad64x64_c = vpx_highbd_sad64x64_c;
+const SadMxNFunc highbd_sad64x32_c = vpx_highbd_sad64x32_c;
+const SadMxNFunc highbd_sad32x64_c = vpx_highbd_sad32x64_c;
+const SadMxNFunc highbd_sad32x32_c = vpx_highbd_sad32x32_c;
+const SadMxNFunc highbd_sad32x16_c = vpx_highbd_sad32x16_c;
+const SadMxNFunc highbd_sad16x32_c = vpx_highbd_sad16x32_c;
+const SadMxNFunc highbd_sad16x16_c = vpx_highbd_sad16x16_c;
+const SadMxNFunc highbd_sad16x8_c = vpx_highbd_sad16x8_c;
+const SadMxNFunc highbd_sad8x16_c = vpx_highbd_sad8x16_c;
+const SadMxNFunc highbd_sad8x8_c = vpx_highbd_sad8x8_c;
+const SadMxNFunc highbd_sad8x4_c = vpx_highbd_sad8x4_c;
+const SadMxNFunc highbd_sad4x8_c = vpx_highbd_sad4x8_c;
+const SadMxNFunc highbd_sad4x4_c = vpx_highbd_sad4x4_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
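+// The last make_tuple() argument is the bit depth handed to SADTestBase:
+// -1 marks a regular 8-bit function, while 8, 10 and 12 are used with the
+// high bit depth variants.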
 const SadMxNParam c_tests[] = {
-  make_tuple(16, 16, sad_16x16_c),
-  make_tuple(8, 16, sad_8x16_c),
-  make_tuple(16, 8, sad_16x8_c),
-  make_tuple(8, 8, sad_8x8_c),
-  make_tuple(4, 4, sad_4x4_c),
+  make_tuple(64, 64, sad64x64_c, -1),
+  make_tuple(64, 32, sad64x32_c, -1),
+  make_tuple(32, 64, sad32x64_c, -1),
+  make_tuple(32, 32, sad32x32_c, -1),
+  make_tuple(32, 16, sad32x16_c, -1),
+  make_tuple(16, 32, sad16x32_c, -1),
+  make_tuple(16, 16, sad16x16_c, -1),
+  make_tuple(16, 8, sad16x8_c, -1),
+  make_tuple(8, 16, sad8x16_c, -1),
+  make_tuple(8, 8, sad8x8_c, -1),
+  make_tuple(8, 4, sad8x4_c, -1),
+  make_tuple(4, 8, sad4x8_c, -1),
+  make_tuple(4, 4, sad4x4_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_c, 8),
+  make_tuple(64, 32, highbd_sad64x32_c, 8),
+  make_tuple(32, 64, highbd_sad32x64_c, 8),
+  make_tuple(32, 32, highbd_sad32x32_c, 8),
+  make_tuple(32, 16, highbd_sad32x16_c, 8),
+  make_tuple(16, 32, highbd_sad16x32_c, 8),
+  make_tuple(16, 16, highbd_sad16x16_c, 8),
+  make_tuple(16, 8, highbd_sad16x8_c, 8),
+  make_tuple(8, 16, highbd_sad8x16_c, 8),
+  make_tuple(8, 8, highbd_sad8x8_c, 8),
+  make_tuple(8, 4, highbd_sad8x4_c, 8),
+  make_tuple(4, 8, highbd_sad4x8_c, 8),
+  make_tuple(4, 4, highbd_sad4x4_c, 8),
+  make_tuple(64, 64, highbd_sad64x64_c, 10),
+  make_tuple(64, 32, highbd_sad64x32_c, 10),
+  make_tuple(32, 64, highbd_sad32x64_c, 10),
+  make_tuple(32, 32, highbd_sad32x32_c, 10),
+  make_tuple(32, 16, highbd_sad32x16_c, 10),
+  make_tuple(16, 32, highbd_sad16x32_c, 10),
+  make_tuple(16, 16, highbd_sad16x16_c, 10),
+  make_tuple(16, 8, highbd_sad16x8_c, 10),
+  make_tuple(8, 16, highbd_sad8x16_c, 10),
+  make_tuple(8, 8, highbd_sad8x8_c, 10),
+  make_tuple(8, 4, highbd_sad8x4_c, 10),
+  make_tuple(4, 8, highbd_sad4x8_c, 10),
+  make_tuple(4, 4, highbd_sad4x4_c, 10),
+  make_tuple(64, 64, highbd_sad64x64_c, 12),
+  make_tuple(64, 32, highbd_sad64x32_c, 12),
+  make_tuple(32, 64, highbd_sad32x64_c, 12),
+  make_tuple(32, 32, highbd_sad32x32_c, 12),
+  make_tuple(32, 16, highbd_sad32x16_c, 12),
+  make_tuple(16, 32, highbd_sad16x32_c, 12),
+  make_tuple(16, 16, highbd_sad16x16_c, 12),
+  make_tuple(16, 8, highbd_sad16x8_c, 12),
+  make_tuple(8, 16, highbd_sad8x16_c, 12),
+  make_tuple(8, 8, highbd_sad8x8_c, 12),
+  make_tuple(8, 4, highbd_sad8x4_c, 12),
+  make_tuple(4, 8, highbd_sad4x8_c, 12),
+  make_tuple(4, 4, highbd_sad4x4_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
-#endif  // CONFIG_VP8_ENCODER
 
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_c_vp9 = vp9_sad64x64_c;
-const SadMxNVp9Func sad_32x32_c_vp9 = vp9_sad32x32_c;
-const SadMxNVp9Func sad_16x16_c_vp9 = vp9_sad16x16_c;
-const SadMxNVp9Func sad_8x16_c_vp9 = vp9_sad8x16_c;
-const SadMxNVp9Func sad_16x8_c_vp9 = vp9_sad16x8_c;
-const SadMxNVp9Func sad_8x8_c_vp9 = vp9_sad8x8_c;
-const SadMxNVp9Func sad_8x4_c_vp9 = vp9_sad8x4_c;
-const SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c;
-const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c;
-const SadMxNVp9Param c_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_c_vp9),
-  make_tuple(32, 32, sad_32x32_c_vp9),
-  make_tuple(16, 16, sad_16x16_c_vp9),
-  make_tuple(8, 16, sad_8x16_c_vp9),
-  make_tuple(16, 8, sad_16x8_c_vp9),
-  make_tuple(8, 8, sad_8x8_c_vp9),
-  make_tuple(8, 4, sad_8x4_c_vp9),
-  make_tuple(4, 8, sad_4x8_c_vp9),
-  make_tuple(4, 4, sad_4x4_c_vp9),
+const SadMxNAvgFunc sad64x64_avg_c = vpx_sad64x64_avg_c;
+const SadMxNAvgFunc sad64x32_avg_c = vpx_sad64x32_avg_c;
+const SadMxNAvgFunc sad32x64_avg_c = vpx_sad32x64_avg_c;
+const SadMxNAvgFunc sad32x32_avg_c = vpx_sad32x32_avg_c;
+const SadMxNAvgFunc sad32x16_avg_c = vpx_sad32x16_avg_c;
+const SadMxNAvgFunc sad16x32_avg_c = vpx_sad16x32_avg_c;
+const SadMxNAvgFunc sad16x16_avg_c = vpx_sad16x16_avg_c;
+const SadMxNAvgFunc sad16x8_avg_c = vpx_sad16x8_avg_c;
+const SadMxNAvgFunc sad8x16_avg_c = vpx_sad8x16_avg_c;
+const SadMxNAvgFunc sad8x8_avg_c = vpx_sad8x8_avg_c;
+const SadMxNAvgFunc sad8x4_avg_c = vpx_sad8x4_avg_c;
+const SadMxNAvgFunc sad4x8_avg_c = vpx_sad4x8_avg_c;
+const SadMxNAvgFunc sad4x4_avg_c = vpx_sad4x4_avg_c;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgFunc highbd_sad64x64_avg_c = vpx_highbd_sad64x64_avg_c;
+const SadMxNAvgFunc highbd_sad64x32_avg_c = vpx_highbd_sad64x32_avg_c;
+const SadMxNAvgFunc highbd_sad32x64_avg_c = vpx_highbd_sad32x64_avg_c;
+const SadMxNAvgFunc highbd_sad32x32_avg_c = vpx_highbd_sad32x32_avg_c;
+const SadMxNAvgFunc highbd_sad32x16_avg_c = vpx_highbd_sad32x16_avg_c;
+const SadMxNAvgFunc highbd_sad16x32_avg_c = vpx_highbd_sad16x32_avg_c;
+const SadMxNAvgFunc highbd_sad16x16_avg_c = vpx_highbd_sad16x16_avg_c;
+const SadMxNAvgFunc highbd_sad16x8_avg_c = vpx_highbd_sad16x8_avg_c;
+const SadMxNAvgFunc highbd_sad8x16_avg_c = vpx_highbd_sad8x16_avg_c;
+const SadMxNAvgFunc highbd_sad8x8_avg_c = vpx_highbd_sad8x8_avg_c;
+const SadMxNAvgFunc highbd_sad8x4_avg_c = vpx_highbd_sad8x4_avg_c;
+const SadMxNAvgFunc highbd_sad4x8_avg_c = vpx_highbd_sad4x8_avg_c;
+const SadMxNAvgFunc highbd_sad4x4_avg_c = vpx_highbd_sad4x4_avg_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgParam avg_c_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_c, -1),
+  make_tuple(64, 32, sad64x32_avg_c, -1),
+  make_tuple(32, 64, sad32x64_avg_c, -1),
+  make_tuple(32, 32, sad32x32_avg_c, -1),
+  make_tuple(32, 16, sad32x16_avg_c, -1),
+  make_tuple(16, 32, sad16x32_avg_c, -1),
+  make_tuple(16, 16, sad16x16_avg_c, -1),
+  make_tuple(16, 8, sad16x8_avg_c, -1),
+  make_tuple(8, 16, sad8x16_avg_c, -1),
+  make_tuple(8, 8, sad8x8_avg_c, -1),
+  make_tuple(8, 4, sad8x4_avg_c, -1),
+  make_tuple(4, 8, sad4x8_avg_c, -1),
+  make_tuple(4, 4, sad4x4_avg_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 8),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 8),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 8),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 8),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 8),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 8),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 8),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 8),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 8),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 8),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 8),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 8),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 8),
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 10),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 10),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 10),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 10),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 10),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 10),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 10),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 10),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 10),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 10),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 10),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 10),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 10),
+  make_tuple(64, 64, highbd_sad64x64_avg_c, 12),
+  make_tuple(64, 32, highbd_sad64x32_avg_c, 12),
+  make_tuple(32, 64, highbd_sad32x64_avg_c, 12),
+  make_tuple(32, 32, highbd_sad32x32_avg_c, 12),
+  make_tuple(32, 16, highbd_sad32x16_avg_c, 12),
+  make_tuple(16, 32, highbd_sad16x32_avg_c, 12),
+  make_tuple(16, 16, highbd_sad16x16_avg_c, 12),
+  make_tuple(16, 8, highbd_sad16x8_avg_c, 12),
+  make_tuple(8, 16, highbd_sad8x16_avg_c, 12),
+  make_tuple(8, 8, highbd_sad8x8_avg_c, 12),
+  make_tuple(8, 4, highbd_sad8x4_avg_c, 12),
+  make_tuple(4, 8, highbd_sad4x8_avg_c, 12),
+  make_tuple(4, 4, highbd_sad4x4_avg_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
+INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
-const SadMxNx4Func sad_64x64x4d_c = vp9_sad64x64x4d_c;
-const SadMxNx4Func sad_64x32x4d_c = vp9_sad64x32x4d_c;
-const SadMxNx4Func sad_32x64x4d_c = vp9_sad32x64x4d_c;
-const SadMxNx4Func sad_32x32x4d_c = vp9_sad32x32x4d_c;
-const SadMxNx4Func sad_32x16x4d_c = vp9_sad32x16x4d_c;
-const SadMxNx4Func sad_16x32x4d_c = vp9_sad16x32x4d_c;
-const SadMxNx4Func sad_16x16x4d_c = vp9_sad16x16x4d_c;
-const SadMxNx4Func sad_16x8x4d_c = vp9_sad16x8x4d_c;
-const SadMxNx4Func sad_8x16x4d_c = vp9_sad8x16x4d_c;
-const SadMxNx4Func sad_8x8x4d_c = vp9_sad8x8x4d_c;
-const SadMxNx4Func sad_8x4x4d_c = vp9_sad8x4x4d_c;
-const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c;
-const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c;
-INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
-                        make_tuple(64, 64, sad_64x64x4d_c),
-                        make_tuple(64, 32, sad_64x32x4d_c),
-                        make_tuple(32, 64, sad_32x64x4d_c),
-                        make_tuple(32, 32, sad_32x32x4d_c),
-                        make_tuple(32, 16, sad_32x16x4d_c),
-                        make_tuple(16, 32, sad_16x32x4d_c),
-                        make_tuple(16, 16, sad_16x16x4d_c),
-                        make_tuple(16, 8, sad_16x8x4d_c),
-                        make_tuple(8, 16, sad_8x16x4d_c),
-                        make_tuple(8, 8, sad_8x8x4d_c),
-                        make_tuple(8, 4, sad_8x4x4d_c),
-                        make_tuple(4, 8, sad_4x8x4d_c),
-                        make_tuple(4, 4, sad_4x4x4d_c)));
-#endif  // CONFIG_VP9_ENCODER
+const SadMxNx4Func sad64x64x4d_c = vpx_sad64x64x4d_c;
+const SadMxNx4Func sad64x32x4d_c = vpx_sad64x32x4d_c;
+const SadMxNx4Func sad32x64x4d_c = vpx_sad32x64x4d_c;
+const SadMxNx4Func sad32x32x4d_c = vpx_sad32x32x4d_c;
+const SadMxNx4Func sad32x16x4d_c = vpx_sad32x16x4d_c;
+const SadMxNx4Func sad16x32x4d_c = vpx_sad16x32x4d_c;
+const SadMxNx4Func sad16x16x4d_c = vpx_sad16x16x4d_c;
+const SadMxNx4Func sad16x8x4d_c = vpx_sad16x8x4d_c;
+const SadMxNx4Func sad8x16x4d_c = vpx_sad8x16x4d_c;
+const SadMxNx4Func sad8x8x4d_c = vpx_sad8x8x4d_c;
+const SadMxNx4Func sad8x4x4d_c = vpx_sad8x4x4d_c;
+const SadMxNx4Func sad4x8x4d_c = vpx_sad4x8x4d_c;
+const SadMxNx4Func sad4x4x4d_c = vpx_sad4x4x4d_c;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Func highbd_sad64x64x4d_c = vpx_highbd_sad64x64x4d_c;
+const SadMxNx4Func highbd_sad64x32x4d_c = vpx_highbd_sad64x32x4d_c;
+const SadMxNx4Func highbd_sad32x64x4d_c = vpx_highbd_sad32x64x4d_c;
+const SadMxNx4Func highbd_sad32x32x4d_c = vpx_highbd_sad32x32x4d_c;
+const SadMxNx4Func highbd_sad32x16x4d_c = vpx_highbd_sad32x16x4d_c;
+const SadMxNx4Func highbd_sad16x32x4d_c = vpx_highbd_sad16x32x4d_c;
+const SadMxNx4Func highbd_sad16x16x4d_c = vpx_highbd_sad16x16x4d_c;
+const SadMxNx4Func highbd_sad16x8x4d_c = vpx_highbd_sad16x8x4d_c;
+const SadMxNx4Func highbd_sad8x16x4d_c = vpx_highbd_sad8x16x4d_c;
+const SadMxNx4Func highbd_sad8x8x4d_c = vpx_highbd_sad8x8x4d_c;
+const SadMxNx4Func highbd_sad8x4x4d_c = vpx_highbd_sad8x4x4d_c;
+const SadMxNx4Func highbd_sad4x8x4d_c = vpx_highbd_sad4x8x4d_c;
+const SadMxNx4Func highbd_sad4x4x4d_c = vpx_highbd_sad4x4x4d_c;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Param x4d_c_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_c, -1),
+  make_tuple(64, 32, sad64x32x4d_c, -1),
+  make_tuple(32, 64, sad32x64x4d_c, -1),
+  make_tuple(32, 32, sad32x32x4d_c, -1),
+  make_tuple(32, 16, sad32x16x4d_c, -1),
+  make_tuple(16, 32, sad16x32x4d_c, -1),
+  make_tuple(16, 16, sad16x16x4d_c, -1),
+  make_tuple(16, 8, sad16x8x4d_c, -1),
+  make_tuple(8, 16, sad8x16x4d_c, -1),
+  make_tuple(8, 8, sad8x8x4d_c, -1),
+  make_tuple(8, 4, sad8x4x4d_c, -1),
+  make_tuple(4, 8, sad4x8x4d_c, -1),
+  make_tuple(4, 4, sad4x4x4d_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 8),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 8),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 8),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 8),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 8),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 8),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 8),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 8),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 8),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 8),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 8),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 8),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 8),
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 10),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 10),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 10),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 10),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 10),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 10),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 10),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 10),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 10),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 10),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 10),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 10),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 10),
+  make_tuple(64, 64, highbd_sad64x64x4d_c, 12),
+  make_tuple(64, 32, highbd_sad64x32x4d_c, 12),
+  make_tuple(32, 64, highbd_sad32x64x4d_c, 12),
+  make_tuple(32, 32, highbd_sad32x32x4d_c, 12),
+  make_tuple(32, 16, highbd_sad32x16x4d_c, 12),
+  make_tuple(16, 32, highbd_sad16x32x4d_c, 12),
+  make_tuple(16, 16, highbd_sad16x16x4d_c, 12),
+  make_tuple(16, 8, highbd_sad16x8x4d_c, 12),
+  make_tuple(8, 16, highbd_sad8x16x4d_c, 12),
+  make_tuple(8, 8, highbd_sad8x8x4d_c, 12),
+  make_tuple(8, 4, highbd_sad8x4x4d_c, 12),
+  make_tuple(4, 8, highbd_sad4x8x4d_c, 12),
+  make_tuple(4, 4, highbd_sad4x4x4d_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_MEDIA
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6;
-INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_armv6)));
-#endif  // CONFIG_VP8_ENCODER
+const SadMxNFunc sad16x16_media = vpx_sad16x16_media;
+const SadMxNParam media_tests[] = {
+  make_tuple(16, 16, sad16x16_media, -1),
+};
+INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests));
 #endif  // HAVE_MEDIA
 
 #if HAVE_NEON
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_neon = vp8_sad16x16_neon;
-const SadMxNFunc sad_8x16_neon = vp8_sad8x16_neon;
-const SadMxNFunc sad_16x8_neon = vp8_sad16x8_neon;
-const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon;
-const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon;
-INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_neon),
-                        make_tuple(8, 16, sad_8x16_neon),
-                        make_tuple(16, 8, sad_16x8_neon),
-                        make_tuple(8, 8, sad_8x8_neon),
-                        make_tuple(4, 4, sad_4x4_neon)));
-#endif  // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon;
-const SadMxNVp9Func sad_32x32_neon_vp9 = vp9_sad32x32_neon;
-const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon;
-const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon;
-const SadMxNVp9Param neon_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_neon_vp9),
-  make_tuple(32, 32, sad_32x32_neon_vp9),
-  make_tuple(16, 16, sad_16x16_neon_vp9),
-  make_tuple(8, 8, sad_8x8_neon_vp9),
+const SadMxNFunc sad64x64_neon = vpx_sad64x64_neon;
+const SadMxNFunc sad32x32_neon = vpx_sad32x32_neon;
+const SadMxNFunc sad16x16_neon = vpx_sad16x16_neon;
+const SadMxNFunc sad16x8_neon = vpx_sad16x8_neon;
+const SadMxNFunc sad8x16_neon = vpx_sad8x16_neon;
+const SadMxNFunc sad8x8_neon = vpx_sad8x8_neon;
+const SadMxNFunc sad4x4_neon = vpx_sad4x4_neon;
+
+const SadMxNParam neon_tests[] = {
+  make_tuple(64, 64, sad64x64_neon, -1),
+  make_tuple(32, 32, sad32x32_neon, -1),
+  make_tuple(16, 16, sad16x16_neon, -1),
+  make_tuple(16, 8, sad16x8_neon, -1),
+  make_tuple(8, 16, sad8x16_neon, -1),
+  make_tuple(8, 8, sad8x8_neon, -1),
+  make_tuple(4, 4, sad4x4_neon, -1),
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests));
-#endif  // CONFIG_VP9_ENCODER
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+
+const SadMxNx4Func sad64x64x4d_neon = vpx_sad64x64x4d_neon;
+const SadMxNx4Func sad32x32x4d_neon = vpx_sad32x32x4d_neon;
+const SadMxNx4Func sad16x16x4d_neon = vpx_sad16x16x4d_neon;
+const SadMxNx4Param x4d_neon_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_neon, -1),
+  make_tuple(32, 32, sad32x32x4d_neon, -1),
+  make_tuple(16, 16, sad16x16x4d_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_MMX
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_mmx = vp8_sad16x16_mmx;
-const SadMxNFunc sad_8x16_mmx = vp8_sad8x16_mmx;
-const SadMxNFunc sad_16x8_mmx = vp8_sad16x8_mmx;
-const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx;
-const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx;
+const SadMxNFunc sad16x16_mmx = vpx_sad16x16_mmx;
+const SadMxNFunc sad16x8_mmx = vpx_sad16x8_mmx;
+const SadMxNFunc sad8x16_mmx = vpx_sad8x16_mmx;
+const SadMxNFunc sad8x8_mmx = vpx_sad8x8_mmx;
+const SadMxNFunc sad4x4_mmx = vpx_sad4x4_mmx;
 const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, sad_16x16_mmx),
-  make_tuple(8, 16, sad_8x16_mmx),
-  make_tuple(16, 8, sad_16x8_mmx),
-  make_tuple(8, 8, sad_8x8_mmx),
-  make_tuple(4, 4, sad_4x4_mmx),
+  make_tuple(16, 16, sad16x16_mmx, -1),
+  make_tuple(16, 8, sad16x8_mmx, -1),
+  make_tuple(8, 16, sad8x16_mmx, -1),
+  make_tuple(8, 8, sad8x8_mmx, -1),
+  make_tuple(4, 4, sad4x4_mmx, -1),
 };
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-const SadMxNVp9Param mmx_vp9_tests[] = {
-  make_tuple(16, 16, sad_16x16_mmx_vp9),
-  make_tuple(8, 16, sad_8x16_mmx_vp9),
-  make_tuple(16, 8, sad_16x8_mmx_vp9),
-  make_tuple(8, 8, sad_8x8_mmx_vp9),
-  make_tuple(4, 4, sad_4x4_mmx_vp9),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_MMX
 
 #if HAVE_SSE
-#if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
-const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse;
-const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
-                        make_tuple(4, 4, sad_4x4_sse_vp9),
-                        make_tuple(4, 8, sad_4x8_sse_vp9)));
+const SadMxNFunc sad4x8_sse = vpx_sad4x8_sse;
+const SadMxNFunc sad4x4_sse = vpx_sad4x4_sse;
+const SadMxNParam sse_tests[] = {
+  make_tuple(4, 8, sad4x8_sse, -1),
+  make_tuple(4, 4, sad4x4_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests));
 
-const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
-const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
-                        make_tuple(4, 8, sad_4x8x4d_sse),
-                        make_tuple(4, 4, sad_4x4x4d_sse)));
+const SadMxNAvgFunc sad4x8_avg_sse = vpx_sad4x8_avg_sse;
+const SadMxNAvgFunc sad4x4_avg_sse = vpx_sad4x4_avg_sse;
+const SadMxNAvgParam avg_sse_tests[] = {
+  make_tuple(4, 8, sad4x8_avg_sse, -1),
+  make_tuple(4, 4, sad4x4_avg_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests));
+
+const SadMxNx4Func sad4x8x4d_sse = vpx_sad4x8x4d_sse;
+const SadMxNx4Func sad4x4x4d_sse = vpx_sad4x4x4d_sse;
+const SadMxNx4Param x4d_sse_tests[] = {
+  make_tuple(4, 8, sad4x8x4d_sse, -1),
+  make_tuple(4, 4, sad4x4x4d_sse, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::ValuesIn(x4d_sse_tests));
 #endif  // CONFIG_USE_X86INC
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_SSE
 
 #if HAVE_SSE2
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_wmt = vp8_sad16x16_wmt;
-const SadMxNFunc sad_8x16_wmt = vp8_sad8x16_wmt;
-const SadMxNFunc sad_16x8_wmt = vp8_sad16x8_wmt;
-const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt;
-const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt;
+#if CONFIG_USE_X86INC
+const SadMxNFunc sad64x64_sse2 = vpx_sad64x64_sse2;
+const SadMxNFunc sad64x32_sse2 = vpx_sad64x32_sse2;
+const SadMxNFunc sad32x64_sse2 = vpx_sad32x64_sse2;
+const SadMxNFunc sad32x32_sse2 = vpx_sad32x32_sse2;
+const SadMxNFunc sad32x16_sse2 = vpx_sad32x16_sse2;
+const SadMxNFunc sad16x32_sse2 = vpx_sad16x32_sse2;
+const SadMxNFunc sad16x16_sse2 = vpx_sad16x16_sse2;
+const SadMxNFunc sad16x8_sse2 = vpx_sad16x8_sse2;
+const SadMxNFunc sad8x16_sse2 = vpx_sad8x16_sse2;
+const SadMxNFunc sad8x8_sse2 = vpx_sad8x8_sse2;
+const SadMxNFunc sad8x4_sse2 = vpx_sad8x4_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNFunc highbd_sad64x64_sse2 = vpx_highbd_sad64x64_sse2;
+const SadMxNFunc highbd_sad64x32_sse2 = vpx_highbd_sad64x32_sse2;
+const SadMxNFunc highbd_sad32x64_sse2 = vpx_highbd_sad32x64_sse2;
+const SadMxNFunc highbd_sad32x32_sse2 = vpx_highbd_sad32x32_sse2;
+const SadMxNFunc highbd_sad32x16_sse2 = vpx_highbd_sad32x16_sse2;
+const SadMxNFunc highbd_sad16x32_sse2 = vpx_highbd_sad16x32_sse2;
+const SadMxNFunc highbd_sad16x16_sse2 = vpx_highbd_sad16x16_sse2;
+const SadMxNFunc highbd_sad16x8_sse2 = vpx_highbd_sad16x8_sse2;
+const SadMxNFunc highbd_sad8x16_sse2 = vpx_highbd_sad8x16_sse2;
+const SadMxNFunc highbd_sad8x8_sse2 = vpx_highbd_sad8x8_sse2;
+const SadMxNFunc highbd_sad8x4_sse2 = vpx_highbd_sad8x4_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNParam sse2_tests[] = {
-  make_tuple(16, 16, sad_16x16_wmt),
-  make_tuple(8, 16, sad_8x16_wmt),
-  make_tuple(16, 8, sad_16x8_wmt),
-  make_tuple(8, 8, sad_8x8_wmt),
-  make_tuple(4, 4, sad_4x4_wmt),
+  make_tuple(64, 64, sad64x64_sse2, -1),
+  make_tuple(64, 32, sad64x32_sse2, -1),
+  make_tuple(32, 64, sad32x64_sse2, -1),
+  make_tuple(32, 32, sad32x32_sse2, -1),
+  make_tuple(32, 16, sad32x16_sse2, -1),
+  make_tuple(16, 32, sad16x32_sse2, -1),
+  make_tuple(16, 16, sad16x16_sse2, -1),
+  make_tuple(16, 8, sad16x8_sse2, -1),
+  make_tuple(8, 16, sad8x16_sse2, -1),
+  make_tuple(8, 8, sad8x8_sse2, -1),
+  make_tuple(8, 4, sad8x4_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_sse2, 8),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 8),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 8),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 8),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 8),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 8),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 8),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 8),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 8),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 8),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 8),
+  make_tuple(64, 64, highbd_sad64x64_sse2, 10),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 10),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 10),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 10),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 10),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 10),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 10),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 10),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 10),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 10),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 10),
+  make_tuple(64, 64, highbd_sad64x64_sse2, 12),
+  make_tuple(64, 32, highbd_sad64x32_sse2, 12),
+  make_tuple(32, 64, highbd_sad32x64_sse2, 12),
+  make_tuple(32, 32, highbd_sad32x32_sse2, 12),
+  make_tuple(32, 16, highbd_sad32x16_sse2, 12),
+  make_tuple(16, 32, highbd_sad16x32_sse2, 12),
+  make_tuple(16, 16, highbd_sad16x16_sse2, 12),
+  make_tuple(16, 8, highbd_sad16x8_sse2, 12),
+  make_tuple(8, 16, highbd_sad8x16_sse2, 12),
+  make_tuple(8, 8, highbd_sad8x8_sse2, 12),
+  make_tuple(8, 4, highbd_sad8x4_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
-#endif  // CONFIG_VP8_ENCODER
 
-#if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
-const SadMxNVp9Func sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
-const SadMxNVp9Func sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
-const SadMxNVp9Func sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
-const SadMxNVp9Func sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
-const SadMxNVp9Func sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
-const SadMxNVp9Func sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
-const SadMxNVp9Func sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const SadMxNVp9Func sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
-const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
-const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
-const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-const SadMxNVp9Param sse2_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_sse2_vp9),
-  make_tuple(64, 32, sad_64x32_sse2_vp9),
-  make_tuple(32, 64, sad_32x64_sse2_vp9),
-  make_tuple(32, 32, sad_32x32_sse2_vp9),
-  make_tuple(32, 16, sad_32x16_sse2_vp9),
-  make_tuple(16, 32, sad_16x32_sse2_vp9),
-  make_tuple(16, 16, sad_16x16_sse2_vp9),
-  make_tuple(16, 8, sad_16x8_sse2_vp9),
-  make_tuple(8, 16, sad_8x16_sse2_vp9),
-  make_tuple(8, 8, sad_8x8_sse2_vp9),
-  make_tuple(8, 4, sad_8x4_sse2_vp9),
+const SadMxNAvgFunc sad64x64_avg_sse2 = vpx_sad64x64_avg_sse2;
+const SadMxNAvgFunc sad64x32_avg_sse2 = vpx_sad64x32_avg_sse2;
+const SadMxNAvgFunc sad32x64_avg_sse2 = vpx_sad32x64_avg_sse2;
+const SadMxNAvgFunc sad32x32_avg_sse2 = vpx_sad32x32_avg_sse2;
+const SadMxNAvgFunc sad32x16_avg_sse2 = vpx_sad32x16_avg_sse2;
+const SadMxNAvgFunc sad16x32_avg_sse2 = vpx_sad16x32_avg_sse2;
+const SadMxNAvgFunc sad16x16_avg_sse2 = vpx_sad16x16_avg_sse2;
+const SadMxNAvgFunc sad16x8_avg_sse2 = vpx_sad16x8_avg_sse2;
+const SadMxNAvgFunc sad8x16_avg_sse2 = vpx_sad8x16_avg_sse2;
+const SadMxNAvgFunc sad8x8_avg_sse2 = vpx_sad8x8_avg_sse2;
+const SadMxNAvgFunc sad8x4_avg_sse2 = vpx_sad8x4_avg_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgFunc highbd_sad64x64_avg_sse2 = vpx_highbd_sad64x64_avg_sse2;
+const SadMxNAvgFunc highbd_sad64x32_avg_sse2 = vpx_highbd_sad64x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x64_avg_sse2 = vpx_highbd_sad32x64_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x32_avg_sse2 = vpx_highbd_sad32x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad32x16_avg_sse2 = vpx_highbd_sad32x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x32_avg_sse2 = vpx_highbd_sad16x32_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x16_avg_sse2 = vpx_highbd_sad16x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad16x8_avg_sse2 = vpx_highbd_sad16x8_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x16_avg_sse2 = vpx_highbd_sad8x16_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x8_avg_sse2 = vpx_highbd_sad8x8_avg_sse2;
+const SadMxNAvgFunc highbd_sad8x4_avg_sse2 = vpx_highbd_sad8x4_avg_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNAvgParam avg_sse2_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_sse2, -1),
+  make_tuple(64, 32, sad64x32_avg_sse2, -1),
+  make_tuple(32, 64, sad32x64_avg_sse2, -1),
+  make_tuple(32, 32, sad32x32_avg_sse2, -1),
+  make_tuple(32, 16, sad32x16_avg_sse2, -1),
+  make_tuple(16, 32, sad16x32_avg_sse2, -1),
+  make_tuple(16, 16, sad16x16_avg_sse2, -1),
+  make_tuple(16, 8, sad16x8_avg_sse2, -1),
+  make_tuple(8, 16, sad8x16_avg_sse2, -1),
+  make_tuple(8, 8, sad8x8_avg_sse2, -1),
+  make_tuple(8, 4, sad8x4_avg_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 8),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 8),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 8),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 8),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 8),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 8),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 8),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 8),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 8),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 8),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 8),
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 10),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 10),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 10),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 10),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 10),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 10),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 10),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 10),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 10),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 10),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 10),
+  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 12),
+  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 12),
+  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 12),
+  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 12),
+  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 12),
+  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 12),
+  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 12),
+  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 12),
+  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 12),
+  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 12),
+  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests));
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
-const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
-const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
-const SadMxNx4Func sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
-const SadMxNx4Func sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
-const SadMxNx4Func sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2;
-const SadMxNx4Func sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2;
-const SadMxNx4Func sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
-const SadMxNx4Func sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
-const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
-const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
-const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
-INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
-                        make_tuple(64, 64, sad_64x64x4d_sse2),
-                        make_tuple(64, 32, sad_64x32x4d_sse2),
-                        make_tuple(32, 64, sad_32x64x4d_sse2),
-                        make_tuple(32, 32, sad_32x32x4d_sse2),
-                        make_tuple(32, 16, sad_32x16x4d_sse2),
-                        make_tuple(16, 32, sad_16x32x4d_sse2),
-                        make_tuple(16, 16, sad_16x16x4d_sse2),
-                        make_tuple(16, 8, sad_16x8x4d_sse2),
-                        make_tuple(8, 16, sad_8x16x4d_sse2),
-                        make_tuple(8, 8, sad_8x8x4d_sse2),
-                        make_tuple(8, 4, sad_8x4x4d_sse2)));
+const SadMxNx4Func sad64x64x4d_sse2 = vpx_sad64x64x4d_sse2;
+const SadMxNx4Func sad64x32x4d_sse2 = vpx_sad64x32x4d_sse2;
+const SadMxNx4Func sad32x64x4d_sse2 = vpx_sad32x64x4d_sse2;
+const SadMxNx4Func sad32x32x4d_sse2 = vpx_sad32x32x4d_sse2;
+const SadMxNx4Func sad32x16x4d_sse2 = vpx_sad32x16x4d_sse2;
+const SadMxNx4Func sad16x32x4d_sse2 = vpx_sad16x32x4d_sse2;
+const SadMxNx4Func sad16x16x4d_sse2 = vpx_sad16x16x4d_sse2;
+const SadMxNx4Func sad16x8x4d_sse2 = vpx_sad16x8x4d_sse2;
+const SadMxNx4Func sad8x16x4d_sse2 = vpx_sad8x16x4d_sse2;
+const SadMxNx4Func sad8x8x4d_sse2 = vpx_sad8x8x4d_sse2;
+const SadMxNx4Func sad8x4x4d_sse2 = vpx_sad8x4x4d_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Func highbd_sad64x64x4d_sse2 = vpx_highbd_sad64x64x4d_sse2;
+const SadMxNx4Func highbd_sad64x32x4d_sse2 = vpx_highbd_sad64x32x4d_sse2;
+const SadMxNx4Func highbd_sad32x64x4d_sse2 = vpx_highbd_sad32x64x4d_sse2;
+const SadMxNx4Func highbd_sad32x32x4d_sse2 = vpx_highbd_sad32x32x4d_sse2;
+const SadMxNx4Func highbd_sad32x16x4d_sse2 = vpx_highbd_sad32x16x4d_sse2;
+const SadMxNx4Func highbd_sad16x32x4d_sse2 = vpx_highbd_sad16x32x4d_sse2;
+const SadMxNx4Func highbd_sad16x16x4d_sse2 = vpx_highbd_sad16x16x4d_sse2;
+const SadMxNx4Func highbd_sad16x8x4d_sse2 = vpx_highbd_sad16x8x4d_sse2;
+const SadMxNx4Func highbd_sad8x16x4d_sse2 = vpx_highbd_sad8x16x4d_sse2;
+const SadMxNx4Func highbd_sad8x8x4d_sse2 = vpx_highbd_sad8x8x4d_sse2;
+const SadMxNx4Func highbd_sad8x4x4d_sse2 = vpx_highbd_sad8x4x4d_sse2;
+const SadMxNx4Func highbd_sad4x8x4d_sse2 = vpx_highbd_sad4x8x4d_sse2;
+const SadMxNx4Func highbd_sad4x4x4d_sse2 = vpx_highbd_sad4x4x4d_sse2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+const SadMxNx4Param x4d_sse2_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_sse2, -1),
+  make_tuple(64, 32, sad64x32x4d_sse2, -1),
+  make_tuple(32, 64, sad32x64x4d_sse2, -1),
+  make_tuple(32, 32, sad32x32x4d_sse2, -1),
+  make_tuple(32, 16, sad32x16x4d_sse2, -1),
+  make_tuple(16, 32, sad16x32x4d_sse2, -1),
+  make_tuple(16, 16, sad16x16x4d_sse2, -1),
+  make_tuple(16, 8, sad16x8x4d_sse2, -1),
+  make_tuple(8, 16, sad8x16x4d_sse2, -1),
+  make_tuple(8, 8, sad8x8x4d_sse2, -1),
+  make_tuple(8, 4, sad8x4x4d_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 8),
+  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 8),
+  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 8),
+  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 8),
+  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 8),
+  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 8),
+  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 8),
+  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 8),
+  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 8),
+  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 8),
+  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 8),
+  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 8),
+  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 8),
+  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 10),
+  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 10),
+  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 10),
+  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 10),
+  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 10),
+  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 10),
+  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 10),
+  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 10),
+  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 10),
+  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 10),
+  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 10),
+  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 10),
+  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 10),
+  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 12),
+  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 12),
+  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 12),
+  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 12),
+  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 12),
+  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 12),
+  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 12),
+  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 12),
+  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 12),
+  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 12),
+  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 12),
+  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 12),
+  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 #endif  // CONFIG_USE_X86INC
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
-#if CONFIG_VP8_ENCODER
-const SadMxNx4Func sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3;
-const SadMxNx4Func sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3;
-const SadMxNx4Func sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3;
-const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;
-const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;
-INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16x4d_sse3),
-                        make_tuple(16, 8, sad_16x8x4d_sse3),
-                        make_tuple(8, 16, sad_8x16x4d_sse3),
-                        make_tuple(8, 8, sad_8x8x4d_sse3),
-                        make_tuple(4, 4, sad_4x4x4d_sse3)));
-#endif  // CONFIG_VP8_ENCODER
+// The only SSE3 SAD functions are the x3 variants, which do not have tests.
 #endif  // HAVE_SSE3
 
 #if HAVE_SSSE3
-#if CONFIG_USE_X86INC
-#if CONFIG_VP8_ENCODER
-const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3;
-INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_sse3)));
-#endif  // CONFIG_VP8_ENCODER
-#endif  // CONFIG_USE_X86INC
+// The only SSSE3 SAD functions are the x3 variants, which do not have tests.
 #endif  // HAVE_SSSE3
 
+#if HAVE_SSE4_1
+// The only SSE4.1 SAD functions are the x8 variants, which do not have tests.
+#endif  // HAVE_SSE4_1
+
 #if HAVE_AVX2
-#if CONFIG_VP9_ENCODER
-const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
-const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
-INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
-                        make_tuple(32, 32, sad_32x32x4d_avx2),
-                        make_tuple(64, 64, sad_64x64x4d_avx2)));
-#endif  // CONFIG_VP9_ENCODER
+const SadMxNFunc sad64x64_avx2 = vpx_sad64x64_avx2;
+const SadMxNFunc sad64x32_avx2 = vpx_sad64x32_avx2;
+const SadMxNFunc sad32x64_avx2 = vpx_sad32x64_avx2;
+const SadMxNFunc sad32x32_avx2 = vpx_sad32x32_avx2;
+const SadMxNFunc sad32x16_avx2 = vpx_sad32x16_avx2;
+const SadMxNParam avx2_tests[] = {
+  make_tuple(64, 64, sad64x64_avx2, -1),
+  make_tuple(64, 32, sad64x32_avx2, -1),
+  make_tuple(32, 64, sad32x64_avx2, -1),
+  make_tuple(32, 32, sad32x32_avx2, -1),
+  make_tuple(32, 16, sad32x16_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadMxNAvgFunc sad64x64_avg_avx2 = vpx_sad64x64_avg_avx2;
+const SadMxNAvgFunc sad64x32_avg_avx2 = vpx_sad64x32_avg_avx2;
+const SadMxNAvgFunc sad32x64_avg_avx2 = vpx_sad32x64_avg_avx2;
+const SadMxNAvgFunc sad32x32_avg_avx2 = vpx_sad32x32_avg_avx2;
+const SadMxNAvgFunc sad32x16_avg_avx2 = vpx_sad32x16_avg_avx2;
+const SadMxNAvgParam avg_avx2_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_avx2, -1),
+  make_tuple(64, 32, sad64x32_avg_avx2, -1),
+  make_tuple(32, 64, sad32x64_avg_avx2, -1),
+  make_tuple(32, 32, sad32x32_avg_avx2, -1),
+  make_tuple(32, 16, sad32x16_avg_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+
+const SadMxNx4Func sad64x64x4d_avx2 = vpx_sad64x64x4d_avx2;
+const SadMxNx4Func sad32x32x4d_avx2 = vpx_sad32x32x4d_avx2;
+const SadMxNx4Param x4d_avx2_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_avx2, -1),
+  make_tuple(32, 32, sad32x32x4d_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
 
+//------------------------------------------------------------------------------
+// MIPS functions
+#if HAVE_MSA
+const SadMxNFunc sad64x64_msa = vpx_sad64x64_msa;
+const SadMxNFunc sad64x32_msa = vpx_sad64x32_msa;
+const SadMxNFunc sad32x64_msa = vpx_sad32x64_msa;
+const SadMxNFunc sad32x32_msa = vpx_sad32x32_msa;
+const SadMxNFunc sad32x16_msa = vpx_sad32x16_msa;
+const SadMxNFunc sad16x32_msa = vpx_sad16x32_msa;
+const SadMxNFunc sad16x16_msa = vpx_sad16x16_msa;
+const SadMxNFunc sad16x8_msa = vpx_sad16x8_msa;
+const SadMxNFunc sad8x16_msa = vpx_sad8x16_msa;
+const SadMxNFunc sad8x8_msa = vpx_sad8x8_msa;
+const SadMxNFunc sad8x4_msa = vpx_sad8x4_msa;
+const SadMxNFunc sad4x8_msa = vpx_sad4x8_msa;
+const SadMxNFunc sad4x4_msa = vpx_sad4x4_msa;
+const SadMxNParam msa_tests[] = {
+  make_tuple(64, 64, sad64x64_msa, -1),
+  make_tuple(64, 32, sad64x32_msa, -1),
+  make_tuple(32, 64, sad32x64_msa, -1),
+  make_tuple(32, 32, sad32x32_msa, -1),
+  make_tuple(32, 16, sad32x16_msa, -1),
+  make_tuple(16, 32, sad16x32_msa, -1),
+  make_tuple(16, 16, sad16x16_msa, -1),
+  make_tuple(16, 8, sad16x8_msa, -1),
+  make_tuple(8, 16, sad8x16_msa, -1),
+  make_tuple(8, 8, sad8x8_msa, -1),
+  make_tuple(8, 4, sad8x4_msa, -1),
+  make_tuple(4, 8, sad4x8_msa, -1),
+  make_tuple(4, 4, sad4x4_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
+
+const SadMxNAvgFunc sad64x64_avg_msa = vpx_sad64x64_avg_msa;
+const SadMxNAvgFunc sad64x32_avg_msa = vpx_sad64x32_avg_msa;
+const SadMxNAvgFunc sad32x64_avg_msa = vpx_sad32x64_avg_msa;
+const SadMxNAvgFunc sad32x32_avg_msa = vpx_sad32x32_avg_msa;
+const SadMxNAvgFunc sad32x16_avg_msa = vpx_sad32x16_avg_msa;
+const SadMxNAvgFunc sad16x32_avg_msa = vpx_sad16x32_avg_msa;
+const SadMxNAvgFunc sad16x16_avg_msa = vpx_sad16x16_avg_msa;
+const SadMxNAvgFunc sad16x8_avg_msa = vpx_sad16x8_avg_msa;
+const SadMxNAvgFunc sad8x16_avg_msa = vpx_sad8x16_avg_msa;
+const SadMxNAvgFunc sad8x8_avg_msa = vpx_sad8x8_avg_msa;
+const SadMxNAvgFunc sad8x4_avg_msa = vpx_sad8x4_avg_msa;
+const SadMxNAvgFunc sad4x8_avg_msa = vpx_sad4x8_avg_msa;
+const SadMxNAvgFunc sad4x4_avg_msa = vpx_sad4x4_avg_msa;
+const SadMxNAvgParam avg_msa_tests[] = {
+  make_tuple(64, 64, sad64x64_avg_msa, -1),
+  make_tuple(64, 32, sad64x32_avg_msa, -1),
+  make_tuple(32, 64, sad32x64_avg_msa, -1),
+  make_tuple(32, 32, sad32x32_avg_msa, -1),
+  make_tuple(32, 16, sad32x16_avg_msa, -1),
+  make_tuple(16, 32, sad16x32_avg_msa, -1),
+  make_tuple(16, 16, sad16x16_avg_msa, -1),
+  make_tuple(16, 8, sad16x8_avg_msa, -1),
+  make_tuple(8, 16, sad8x16_avg_msa, -1),
+  make_tuple(8, 8, sad8x8_avg_msa, -1),
+  make_tuple(8, 4, sad8x4_avg_msa, -1),
+  make_tuple(4, 8, sad4x8_avg_msa, -1),
+  make_tuple(4, 4, sad4x4_avg_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+
+const SadMxNx4Func sad64x64x4d_msa = vpx_sad64x64x4d_msa;
+const SadMxNx4Func sad64x32x4d_msa = vpx_sad64x32x4d_msa;
+const SadMxNx4Func sad32x64x4d_msa = vpx_sad32x64x4d_msa;
+const SadMxNx4Func sad32x32x4d_msa = vpx_sad32x32x4d_msa;
+const SadMxNx4Func sad32x16x4d_msa = vpx_sad32x16x4d_msa;
+const SadMxNx4Func sad16x32x4d_msa = vpx_sad16x32x4d_msa;
+const SadMxNx4Func sad16x16x4d_msa = vpx_sad16x16x4d_msa;
+const SadMxNx4Func sad16x8x4d_msa = vpx_sad16x8x4d_msa;
+const SadMxNx4Func sad8x16x4d_msa = vpx_sad8x16x4d_msa;
+const SadMxNx4Func sad8x8x4d_msa = vpx_sad8x8x4d_msa;
+const SadMxNx4Func sad8x4x4d_msa = vpx_sad8x4x4d_msa;
+const SadMxNx4Func sad4x8x4d_msa = vpx_sad4x8x4d_msa;
+const SadMxNx4Func sad4x4x4d_msa = vpx_sad4x4x4d_msa;
+const SadMxNx4Param x4d_msa_tests[] = {
+  make_tuple(64, 64, sad64x64x4d_msa, -1),
+  make_tuple(64, 32, sad64x32x4d_msa, -1),
+  make_tuple(32, 64, sad32x64x4d_msa, -1),
+  make_tuple(32, 32, sad32x32x4d_msa, -1),
+  make_tuple(32, 16, sad32x16x4d_msa, -1),
+  make_tuple(16, 32, sad16x32x4d_msa, -1),
+  make_tuple(16, 16, sad16x16x4d_msa, -1),
+  make_tuple(16, 8, sad16x8x4d_msa, -1),
+  make_tuple(8, 16, sad8x16x4d_msa, -1),
+  make_tuple(8, 8, sad8x8x4d_msa, -1),
+  make_tuple(8, 4, sad8x4x4d_msa, -1),
+  make_tuple(4, 8, sad4x8x4d_msa, -1),
+  make_tuple(4, 4, sad4x4x4d_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+#endif  // HAVE_MSA
+
 }  // namespace
diff --git a/libvpx/test/set_maps.sh b/libvpx/test/set_maps.sh
new file mode 100755
index 0000000..e7c8d43
--- /dev/null
+++ b/libvpx/test/set_maps.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx set_maps example. To add new tests to this file,
+##  do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to set_maps_tests (on a new line).
+##
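+##  For example, set_maps_vp8() further below wraps a set_maps call in a
+##  codec availability check and is then listed in set_maps_tests.
+##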
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBVPX_BIN_PATH.
+set_maps_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path set_maps)" ]; then
+    elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+  local encoder="$(vpx_tool_path set_maps)"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+set_maps_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    set_maps vp8 || return 1
+  fi
+}
+
+set_maps_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    set_maps vp9 || return 1
+  fi
+}
+
+set_maps_tests="set_maps_vp8
+                set_maps_vp9"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"
diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc
index 5b054f4..fea8cca 100644
--- a/libvpx/test/set_roi.cc
+++ b/libvpx/test/set_roi.cc
@@ -53,7 +53,7 @@
   cpi.common.mb_rows = 240 >> 4;
   cpi.common.mb_cols = 320 >> 4;
   const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
-  vpx_memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+  memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
 
   // Segment map
   cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
@@ -61,9 +61,9 @@
   // Allocate memory for the source memory map.
   unsigned char *roi_map =
     reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
-  vpx_memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
-  vpx_memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
-  vpx_memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));
+  memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));
 
   // Do a test call with valid parameters.
   int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
diff --git a/libvpx/test/sixtap_predict_test.cc b/libvpx/test/sixtap_predict_test.cc
index 1e6d915..8c7c98d 100644
--- a/libvpx/test/sixtap_predict_test.cc
+++ b/libvpx/test/sixtap_predict_test.cc
@@ -11,13 +11,15 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -238,4 +240,16 @@
         make_tuple(8, 4, sixtap_8x4_ssse3),
         make_tuple(4, 4, sixtap_4x4_ssse3)));
 #endif
+#if HAVE_MSA
+const SixtapPredictFunc sixtap_16x16_msa = vp8_sixtap_predict16x16_msa;
+const SixtapPredictFunc sixtap_8x8_msa = vp8_sixtap_predict8x8_msa;
+const SixtapPredictFunc sixtap_8x4_msa = vp8_sixtap_predict8x4_msa;
+const SixtapPredictFunc sixtap_4x4_msa = vp8_sixtap_predict4x4_msa;
+INSTANTIATE_TEST_CASE_P(
+    MSA, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_msa),
+        make_tuple(8, 8, sixtap_8x8_msa),
+        make_tuple(8, 4, sixtap_8x4_msa),
+        make_tuple(4, 4, sixtap_4x4_msa)));
+#endif
 }  // namespace
diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc
deleted file mode 100644
index 6619fb1..0000000
--- a/libvpx/test/subtract_test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vp8/encoder/block.h"
-#include "vpx_mem/vpx_mem.h"
-
-typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
-
-namespace {
-
-class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
- public:
-  virtual void TearDown() {
-    libvpx_test::ClearSystemState();
-  }
-};
-
-using libvpx_test::ACMRandom;
-
-TEST_P(SubtractBlockTest, SimpleSubtract) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  BLOCK be;
-  BLOCKD bd;
-  // in libvpx, this stride is always 16
-  const int kDiffPredStride = 16;
-  const int kSrcStride[] = {32, 16, 8, 4, 0};
-  const int kBlockWidth = 4;
-  const int kBlockHeight = 4;
-
-  // Allocate... align to 16 for mmx/sse tests
-  uint8_t *source = reinterpret_cast<uint8_t*>(
-      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
-  be.src_diff = reinterpret_cast<int16_t*>(
-      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
-  bd.predictor = reinterpret_cast<unsigned char*>(
-      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
-
-  for (int i = 0; kSrcStride[i] > 0; ++i) {
-    // start at block0
-    be.src = 0;
-    be.base_src = &source;
-    be.src_stride = kSrcStride[i];
-
-    // set difference
-    int16_t *src_diff = be.src_diff;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        src_diff[c] = static_cast<int16_t>(0xa5a5u);
-      }
-      src_diff += kDiffPredStride;
-    }
-
-    // set destination
-    uint8_t *base_src = *be.base_src;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        base_src[c] = rnd.Rand8();
-      }
-      base_src += be.src_stride;
-    }
-
-    // set predictor
-    uint8_t *predictor = bd.predictor;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        predictor[c] = rnd.Rand8();
-      }
-      predictor += kDiffPredStride;
-    }
-
-    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
-
-    base_src = *be.base_src;
-    src_diff = be.src_diff;
-    predictor = bd.predictor;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
-                                                             << ", c = " << c;
-      }
-      src_diff += kDiffPredStride;
-      predictor += kDiffPredStride;
-      base_src += be.src_stride;
-    }
-  }
-  vpx_free(be.src_diff);
-  vpx_free(source);
-  vpx_free(bd.predictor);
-}
-
-INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_c));
-
-#if HAVE_NEON_ASM
-INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_neon));
-#endif
-
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_mmx));
-#endif
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_sse2));
-#endif
-
-}  // namespace
diff --git a/libvpx/test/superframe_test.cc b/libvpx/test/superframe_test.cc
index c0f542d..a8102b7 100644
--- a/libvpx/test/superframe_test.cc
+++ b/libvpx/test/superframe_test.cc
@@ -94,4 +94,7 @@
 
 VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
     ::libvpx_test::kTwoPassGood));
+
+VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/libvpx/test/svc_test.cc b/libvpx/test/svc_test.cc
index e9cf38d..b955cee 100644
--- a/libvpx/test/svc_test.cc
+++ b/libvpx/test/svc_test.cc
@@ -13,6 +13,9 @@
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/i420_video_source.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
@@ -21,6 +24,7 @@
 
 using libvpx_test::CodecFactory;
 using libvpx_test::Decoder;
+using libvpx_test::DxDataIterator;
 using libvpx_test::VP9CodecFactory;
 
 class SvcTest : public ::testing::Test {
@@ -56,15 +60,254 @@
     codec_enc_.kf_min_dist = 100;
     codec_enc_.kf_max_dist = 100;
 
-    vpx_codec_dec_cfg_t dec_cfg = {0};
+    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
     VP9CodecFactory codec_factory;
     decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+
+    tile_columns_ = 0;
+    tile_rows_ = 0;
   }
 
   virtual void TearDown() {
-    vpx_svc_release(&svc_);
+    ReleaseEncoder();
     delete(decoder_);
+  }
+
+  void InitializeEncoder() {
+    const vpx_codec_err_t res =
+        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
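+    // Note: VP9E_SET_TILE_COLUMNS and VP9E_SET_TILE_ROWS take log2 values,
+    // so e.g. tile_columns_ = 1 requests two tile columns.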
+    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
+    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
+    codec_initialized_ = true;
+  }
+
+  void ReleaseEncoder() {
+    vpx_svc_release(&svc_);
     if (codec_initialized_) vpx_codec_destroy(&codec_);
+    codec_initialized_ = false;
+  }
+
+  void GetStatsData(std::string *const stats_buf) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
+        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
+        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
+        stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
+                          cx_pkt->data.twopass_stats.sz);
+      }
+    }
+  }
+
+  void Pass1EncodeNFrames(const int n, const int layers,
+                          std::string *const stats_buf) {
+    vpx_codec_err_t res;
+
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      GetStatsData(stats_buf);
+      video.Next();
+    }
+
+    // Flush encoder and test EOS packet.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    ASSERT_EQ(VPX_CODEC_OK, res);
+    GetStatsData(stats_buf);
+
+    ReleaseEncoder();
+  }
+
+  void StoreFrames(const size_t max_frame_received,
+                   struct vpx_fixed_buf *const outputs,
+                   size_t *const frame_received) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+        const size_t frame_size = cx_pkt->data.frame.sz;
+
+        EXPECT_GT(frame_size, 0U);
+        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
+        ASSERT_LT(*frame_received, max_frame_received);
+
+        if (*frame_received == 0)
+          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
+
+        outputs[*frame_received].buf = malloc(frame_size + 16);
+        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
+               frame_size);
+        outputs[*frame_received].sz = frame_size;
+        ++(*frame_received);
+      }
+    }
+  }
+
+  void Pass2EncodeNFrames(std::string *const stats_buf,
+                          const int n, const int layers,
+                          struct vpx_fixed_buf *const outputs) {
+    vpx_codec_err_t res;
+    size_t frame_received = 0;
+
+    ASSERT_TRUE(outputs != NULL);
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.rc_target_bitrate = 500;
+    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
+      ASSERT_TRUE(stats_buf != NULL);
+      ASSERT_GT(stats_buf->size(), 0U);
+      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
+      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
+    }
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      StoreFrames(n, outputs, &frame_received);
+      video.Next();
+    }
+
+    // Flush encoder.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    StoreFrames(n, outputs, &frame_received);
+
+    EXPECT_EQ(frame_received, static_cast<size_t>(n));
+
+    ReleaseEncoder();
+  }
+
+  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
+    int decoded_frames = 0;
+    int received_frames = 0;
+
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+      const vpx_codec_err_t res_dec =
+          decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
+                                inputs[i].sz);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+      ++decoded_frames;
+
+      DxDataIterator dec_iter = decoder_->GetDxData();
+      while (dec_iter.Next() != NULL) {
+        ++received_frames;
+      }
+    }
+    EXPECT_EQ(decoded_frames, n);
+    EXPECT_EQ(received_frames, n);
+  }
+
+  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
+                             const int num_super_frames,
+                             const int remained_spatial_layers) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(num_super_frames, 0);
+    ASSERT_GT(remained_spatial_layers, 0);
+
+    for (int i = 0; i < num_super_frames; ++i) {
+      uint32_t frame_sizes[8] = {0};
+      int frame_count = 0;
+      int frames_found = 0;
+      int frame;
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+
+      vpx_codec_err_t res =
+          vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
+                                     inputs[i].sz, frame_sizes, &frame_count,
+                                     NULL, NULL);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+
+      if (frame_count == 0) {
+        // There's no superframe index; the buffer contains a single frame.
+        ASSERT_EQ(1, remained_spatial_layers);
+      } else {
+        // Found a super frame.
+        uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+        uint8_t *frame_start = frame_data;
+        for (frame = 0; frame < frame_count; ++frame) {
+          // Looking for a visible frame.
+          if (frame_data[0] & 0x02) {
+            ++frames_found;
+            if (frames_found == remained_spatial_layers)
+              break;
+          }
+          frame_data += frame_sizes[frame];
+        }
+        ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+            << "remained_spatial_layers: " << remained_spatial_layers
+            << "    super_frame: " << i;
+        if (frame == frame_count - 1)
+          continue;
+
+        frame_data += frame_sizes[frame];
+
+        // We need to add one more frame for multiple frame contexts.
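+        // A superframe index is laid out as: marker byte, frame_count size
+        // fields of 'mag' bytes each, then the marker byte repeated. The
+        // marker's low 3 bits store frame_count - 1 and bits 3-4 store
+        // mag - 1, so the rebuilt index below keeps only frames 0..frame.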
+        uint8_t marker =
+            static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
+        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+        const size_t index_sz = 2 + mag * frame_count;
+        const size_t new_index_sz = 2 + mag * (frame + 1);
+        marker &= 0x0f8;
+        marker |= frame;
+
+        // Copy existing frame sizes.
+        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
+                new_index_sz - 2);
+        // New marker.
+        frame_data[0] = marker;
+        frame_data += (mag * (frame + 1) + 1);
+
+        *frame_data++ = marker;
+        inputs[i].sz = frame_data - frame_start;
+      }
+    }
+  }
+
+  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      free(inputs[i].buf);
+      inputs[i].buf = NULL;
+      inputs[i].sz = 0;
+    }
   }
 
   SvcContext svc_;
@@ -74,6 +317,8 @@
   std::string test_file_name_;
   bool codec_initialized_;
   Decoder *decoder_;
+  int tile_columns_;
+  int tile_rows_;
 };
 
 TEST_F(SvcTest, SvcInit) {
@@ -93,22 +338,13 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   svc_.spatial_layers = 0;  // use default layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
 }
 
 TEST_F(SvcTest, InitTwoLayers) {
   svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16*16");  // invalid scale values
-  vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");  // valid scale values
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, InvalidOptions) {
@@ -122,20 +358,17 @@
 }
 
 TEST_F(SvcTest, SetLayersOption) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(3, svc_.spatial_layers);
 }
 
 TEST_F(SvcTest, SetMultipleOptions) {
   vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+      vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(2, svc_.spatial_layers);
 }
 
@@ -147,24 +380,50 @@
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  vpx_svc_set_options(&svc_, "quantizers=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, SetQuantizersOption) {
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "min-quantizers=nothing");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "min-quantizers=40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30");
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetAutoAltRefOption) {
@@ -180,324 +439,359 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, "40,30");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetScaleFactors) {
-  vpx_codec_err_t res = vpx_svc_set_scale_factors(NULL, "4/16,16/16");
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_scale_factors(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 2;
-  res = vpx_svc_set_scale_factors(&svc_, "4/16");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 // Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, FirstFrameHasLayers) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  video.Begin();
-
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  if (vpx_svc_get_frame_size(&svc_) == 0) {
-    // Flush encoder
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                         video.duration(), VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-  }
-
-  int frame_size = vpx_svc_get_frame_size(&svc_);
-  EXPECT_GT(frame_size, 0);
-  const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-
-  // this test fails with a decoder error
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+TEST_F(SvcTest, OnePassEncodeOneFrame) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf output = {0};
+  Pass2EncodeNFrames(NULL, 1, 2, &output);
+  DecodeNFrames(&output, 1);
+  FreeBitstreamBuffers(&output, 1);
 }
 
-TEST_F(SvcTest, EncodeThreeFrames) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  EXPECT_EQ(decoded_frames, 3);
+TEST_F(SvcTest, OnePassEncodeThreeFrames) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  codec_enc_.g_lag_in_frames = 0;
+  vpx_fixed_buf outputs[3];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 3);
+  FreeBitstreamBuffers(&outputs[0], 3);
 }
 
-TEST_F(SvcTest, GetLayerResolution) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  // ensure that requested layer is a valid layer
-  uint32_t layer_width, layer_height;
-  res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
-                                     &layer_width, &layer_height);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_get_layer_resolution(NULL, 0, &layer_width, &layer_height);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_get_layer_resolution(&svc_, 0, NULL, &layer_height);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_get_layer_resolution(&svc_, 0, &layer_width, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_get_layer_resolution(&svc_, 0, &layer_width, &layer_height);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(kWidth * 4 / 16, layer_width);
-  EXPECT_EQ(kHeight * 4 / 16, layer_height);
-
-  res = vpx_svc_get_layer_resolution(&svc_, 1, &layer_width, &layer_height);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(kWidth * 8 / 16, layer_width);
-  EXPECT_EQ(kHeight * 8 / 16, layer_height);
-}
-
-TEST_F(SvcTest, TwoPassEncode) {
+TEST_F(SvcTest, TwoPassEncode10Frames) {
   // First pass encode
   std::string stats_buf;
-  svc_.spatial_layers = 2;
-  codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // FRAME 1
-  video.Next();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Flush encoder and test EOS packet
-  res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Tear down encoder
-  vpx_svc_release(&svc_);
-  vpx_codec_destroy(&codec_);
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
   // Second pass encode
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
   codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(20, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
   vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
-  codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
 
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 5, &stats_buf);
 
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 4);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 3);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 2);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
 
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(20, 2, &stats_buf);
 
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1 scale-factors=1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(20, 3, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  DropEnhancementLayers(&outputs[0], 20, 2);
+  DecodeNFrames(&outputs[0], 20);
+  DropEnhancementLayers(&outputs[0], 20, 1);
+  DecodeNFrames(&outputs[0], 20);
+
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, SetMultipleFrameContextsOption) {
+  svc_.spatial_layers = 5;
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
   EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+  svc_.spatial_layers = 2;
+  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  InitializeEncoder();
+}
 
-  EXPECT_EQ(decoded_frames, 3);
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(10, 3, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
+
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 2);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
+
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+
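+  // With two temporal layers, even-numbered frames belong to the temporal
+  // base layer, so keeping every other frame yields a decodable stream.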
+  vpx_fixed_buf base_layer[5];
+  for (int i = 0; i < 5; ++i)
+    base_layer[i] = outputs[i * 2];
+
+  DecodeNFrames(&base_layer[0], 5);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+
+  vpx_fixed_buf base_layer[5];
+  for (int i = 0; i < 5; ++i)
+    base_layer[i] = outputs[i * 2];
+
+  DecodeNFrames(&base_layer[0], 5);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
 }
 
 }  // namespace
diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk
new file mode 100644
index 0000000..dda1c18
--- /dev/null
+++ b/libvpx/test/test-data.mk
@@ -0,0 +1,770 @@
+LIBVPX_TEST_SRCS-yes += test-data.mk
+
+# Encoder test source
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+
+# Test vectors
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm.md5
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
+
+ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+# Encode / Decode test
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+# BBB VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+# Sintel VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+# TOS VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_426x178_tile_1x1_181kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_640x266_tile_1x2_336kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_656kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+endif  # CONFIG_DECODE_PERF_TESTS
+
+ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
+endif  # CONFIG_ENCODE_PERF_TESTS
+
+# sort and remove duplicates
+LIBVPX_TEST_DATA-yes := $(sort $(LIBVPX_TEST_DATA-yes))
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index ee6289f..3590f4e 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -1,683 +1,745 @@
-d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
-b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
-76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
-7448d8798a4380162d4b56f9b452e2f6f9e24e7a  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
-83f50908c8dc0ef8760595447a2ff7727489542e  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
-456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
-c123d1f9f02fb4143abb5e271916e3a3080de8f6  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
-456d1493e52d32a5c30edf44a27debc1fa6b253a  invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
-fe346136b9b8c1e6f6084cc106485706915795e4  invalid-vp90-01-v2.webm
-25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01-v2.webm.res
-d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02-v2.webm
-8e2eff4af87d2b561cce2365713269e301457ef3  invalid-vp90-02-v2.webm.res
-df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v2.webm
-25dd58c22d23f75304d7ce7f69f4e5b02ef9119a  invalid-vp90-03-v2.webm.res
-d637297561dd904eb2c97a9015deeb31c4a1e8d2  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
-3a204bdbeaa3c6458b77bcebb8366d107267f55d  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
-a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
-0b194cc312c3a2e84d156a221b0a5eb615dfddc5  park_joy_90p_10_422.y4m
-ff0e0a21dc2adc95b8c1b37902713700655ced17  park_joy_90p_10_444.y4m
-614c32ae1eca391e867c70d19974f0d62664dd99  park_joy_90p_12_420.y4m
-c92825f1ea25c5c37855083a69faac6ac4641a9e  park_joy_90p_12_422.y4m
-b592189b885b6cc85db55cc98512a197d73d3b34  park_joy_90p_12_444.y4m
-4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c  park_joy_90p_8_420.y4m
-7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947  park_joy_90p_8_422.y4m
-bdb7856e6bc93599bdda05c2e773a9f22b6c6d03  park_joy_90p_8_444.y4m
-b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
-5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
-65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
-906b4c1e99eb734504c504b3f1ad8052137ce672  vp80-00-comprehensive-003.ivf
-ec144b1af53af895db78355785650b96dd3f0ade  vp80-00-comprehensive-004.ivf
-afc7091785c62f1c121c4554a2830c30704587d9  vp80-00-comprehensive-005.ivf
-42ea9d55c818145d06a9b633b8e85c6a6164fd3e  vp80-00-comprehensive-006.ivf
-e5b3a73ab79fe024c14309d653d6bed92902ee3b  vp80-00-comprehensive-007.ivf
-f3c50a58875930adfb84525c0ef59d7e4c08540c  vp80-00-comprehensive-008.ivf
-4b2841fdb83db51ae322096ae468bbb9dc2c8362  vp80-00-comprehensive-009.ivf
-efbff736e3a91ab6a98c5bc2dce65d645944c7b1  vp80-00-comprehensive-010.ivf
-6b315102cae008d22a3d2c231be92cb704a222f8  vp80-00-comprehensive-011.ivf
-f3214a4fea14c2d5ec689936c1613f274c859ee8  vp80-00-comprehensive-012.ivf
-e4094e96d308c8a35b74c480a43d853c5294cd34  vp80-00-comprehensive-013.ivf
-5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a  vp80-00-comprehensive-014.ivf
-e8467688ddf26b5000664f904faf0d70506aa653  vp80-00-comprehensive-015.ivf
-aab55582337dfd2a39ff54fb2576a91910d49337  vp80-00-comprehensive-016.ivf
-1ba24724f80203c9bae4f1d0f99d534721980016  vp80-00-comprehensive-017.ivf
-143a15512b46f436280ddb4d0e6411eb4af434f2  vp80-00-comprehensive-018.ivf
-c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b  vp80-01-intra-1400.ivf
-f383955229afe3408453e316d11553d923ca60d5  vp80-01-intra-1411.ivf
-84e1f4343f174c9f3c83f834bac3196fb325bf2c  vp80-01-intra-1416.ivf
-fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51  vp80-01-intra-1417.ivf
-71ea772d3e9d315b8cbecf41207b8a237c34853b  vp80-02-inter-1402.ivf
-d85dbc4271525dcd128c503f936fe69091d1f8d0  vp80-02-inter-1412.ivf
-d4e5d3ad56511867d025f93724d090f92ba6ec3d  vp80-02-inter-1418.ivf
-91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa  vp80-02-inter-1424.ivf
-17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02  vp80-03-segmentation-01.ivf
-3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb  vp80-03-segmentation-02.ivf
-c156778d5340967d4b369c490848076e92f1f875  vp80-03-segmentation-03.ivf
-d25dcff6c60e87a1af70945b8911b6b4998533b0  vp80-03-segmentation-04.ivf
-362baba2ce454c9db21218f35e81c27a5ed0b730  vp80-03-segmentation-1401.ivf
-d223ae7ee748ce07e74c4679bfd219e84aa9f4b0  vp80-03-segmentation-1403.ivf
-033adf7f3a13836a3f1cffcb87c1972900f2b5c6  vp80-03-segmentation-1407.ivf
-4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f  vp80-03-segmentation-1408.ivf
-f37a62b197c2600d75e0ccfbb31b60efdedac251  vp80-03-segmentation-1409.ivf
-eb25bd7bfba5b2f6935018a930f42d123b1e7fcd  vp80-03-segmentation-1410.ivf
-b9d5c436663a30c27cfff84b53a002e501258843  vp80-03-segmentation-1413.ivf
-6da92b9d1a180cc3a8afe348ab12258f5a37be1a  vp80-03-segmentation-1414.ivf
-a4f5842602886bd669f115f93d8a35c035cb0948  vp80-03-segmentation-1415.ivf
-f295dceb8ef278b77251b3f9df8aee22e161d547  vp80-03-segmentation-1425.ivf
-198dbf9f36f733200e432664cc8c5752d59779de  vp80-03-segmentation-1426.ivf
-7704804e32f5de976803929934a7fafe101ac7b0  vp80-03-segmentation-1427.ivf
-831ccd862ea95ca025d2f3bd8b88678752f5416d  vp80-03-segmentation-1432.ivf
-b3c11978529289f9109f2766fcaba3ebc40e11ef  vp80-03-segmentation-1435.ivf
-a835a731f5520ebfc1002c40121264d0020559ac  vp80-03-segmentation-1436.ivf
-1d1732942f773bb2a5775fcb9689b1579ce28eab  vp80-03-segmentation-1437.ivf
-db04799adfe089dfdf74dbd43cc05ede7161f99e  vp80-03-segmentation-1441.ivf
-7caf39b3f20cfd52b998210878062e52a5edf1e6  vp80-03-segmentation-1442.ivf
-3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261  vp80-04-partitions-1404.ivf
-93cc323b6b6867f1b12dd48773424549c6960a6b  vp80-04-partitions-1405.ivf
-047eedb14b865bdac8a3538e63801054e0295e9c  vp80-04-partitions-1406.ivf
-0f1233bd2bc33f56ce5e495dbd455d122339f384  vp80-05-sharpness-1428.ivf
-51767fc136488a9535c2a4c38067c542ee2048df  vp80-05-sharpness-1429.ivf
-9805aa107672de25d6fb8c35e20d06deca5efe18  vp80-05-sharpness-1430.ivf
-61db6b965f9c27aebe71b85bf2d5877e58e4bbdf  vp80-05-sharpness-1431.ivf
-10420d266290d2923555f84af38eeb96edbd3ae8  vp80-05-sharpness-1433.ivf
-3ed24f9a80cddfdf75824ba95cdb4ff9286cb443  vp80-05-sharpness-1434.ivf
-c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163  vp80-05-sharpness-1438.ivf
-aff51d865c2621b60510459244ea83e958e4baed  vp80-05-sharpness-1439.ivf
-da386e72b19b5485a6af199c5eb60ef25e510dd1  vp80-05-sharpness-1440.ivf
-6759a095203d96ccd267ce09b1b050b8cc4c2f1f  vp80-05-sharpness-1443.ivf
-b95d3cc1d0df991e63e150a801710a72f20d9ba0  vp80-06-smallsize.ivf
-db55ec7fd02c864ba996ff060b25b1e08611330b  vp80-00-comprehensive-001.ivf.md5
-29db0ad011cba1e45f856d5623cd38dac3e3bf19  vp80-00-comprehensive-002.ivf.md5
-e84f258f69e173e7d68f8f8c037a0a3766902182  vp80-00-comprehensive-003.ivf.md5
-eb7912eaf69559a16fd82bc3f5fb1524cf4a4466  vp80-00-comprehensive-004.ivf.md5
-4206f71c94894bd5b5b376f6c09b3817dbc65206  vp80-00-comprehensive-005.ivf.md5
-4f89b356f6f2fecb928f330a10f804f00f5325f5  vp80-00-comprehensive-006.ivf.md5
-2813236a32964dd8007e17648bcf035a20fcda6c  vp80-00-comprehensive-007.ivf.md5
-10746c72098f872803c900e17c5680e451f5f498  vp80-00-comprehensive-008.ivf.md5
-39a23d0692ce64421a7bb7cdf6ccec5928d37fff  vp80-00-comprehensive-009.ivf.md5
-f6e3de8931a0cc659bda8fbc14050346955e72d4  vp80-00-comprehensive-010.ivf.md5
-101683ec195b6e944f7cd1e468fc8921439363e6  vp80-00-comprehensive-011.ivf.md5
-1f592751ce46d8688998fa0fa4fbdcda0fd4058c  vp80-00-comprehensive-012.ivf.md5
-6066176f90ca790251e795fca1a5797d59999841  vp80-00-comprehensive-013.ivf.md5
-2656da94ba93691f23edc4d60b3a09e2be46c217  vp80-00-comprehensive-014.ivf.md5
-c6e0d5f5d61460c8ac8edfa4e701f10312c03133  vp80-00-comprehensive-015.ivf.md5
-ee60fee501d8493e34e8d6a1fe315b51ed09b24a  vp80-00-comprehensive-016.ivf.md5
-9f1914ceffcad4546c0a29de3ef591d8bea304dc  vp80-00-comprehensive-017.ivf.md5
-e0305178fe288a9fd8082b39e2d03181edb19054  vp80-00-comprehensive-018.ivf.md5
-612494da2fa799cc9d76dcdd835ae6c7cb2e5c05  vp80-01-intra-1400.ivf.md5
-48ea06097ac8269c5e8c2131d3d0639f431fcf0e  vp80-01-intra-1411.ivf.md5
-6e2ab4e7677ad0ba868083ca6bc387ee922b400c  vp80-01-intra-1416.ivf.md5
-eca0a90348959ce3854142f8d8641b13050e8349  vp80-01-intra-1417.ivf.md5
-920feea203145d5c2258a91c4e6991934a79a99e  vp80-02-inter-1402.ivf.md5
-f71d97909fe2b3dd65be7e1f56c72237f0cef200  vp80-02-inter-1412.ivf.md5
-e911254569a30bbb2a237ff8b79f69ed9da0672d  vp80-02-inter-1418.ivf.md5
-58c789c50c9bb9cc90580bed291164a0939d28ba  vp80-02-inter-1424.ivf.md5
-ff3e2f441327b9c20a0b37c524e0f5a48a36de7b  vp80-03-segmentation-01.ivf.md5
-0791f417f076a542ae66fbc3426ab4d94cbd6c75  vp80-03-segmentation-02.ivf.md5
-722e50f1a6a91c34302d68681faffc1c26d1cc57  vp80-03-segmentation-03.ivf.md5
-c701f1885bcfb27fb8e70cc65606b289172ef889  vp80-03-segmentation-04.ivf.md5
-f79bc9ec189a2b4807632a3d0c5bf04a178b5300  vp80-03-segmentation-1401.ivf.md5
-b9aa4c74c0219b639811c44760d0b24cd8bb436a  vp80-03-segmentation-1403.ivf.md5
-70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334  vp80-03-segmentation-1407.ivf.md5
-265f962ee781531f9a93b9309461316fd32b2a1d  vp80-03-segmentation-1408.ivf.md5
-0c4ecbbd6dc042d30e626d951b65f460dd6cd563  vp80-03-segmentation-1409.ivf.md5
-cf779af36a937f06570a0fca9db64ba133451dee  vp80-03-segmentation-1410.ivf.md5
-0e6c5036d51ab078842f133934926c598a9cff02  vp80-03-segmentation-1413.ivf.md5
-eb3930aaf229116c80d507516c34759c3f6cdf69  vp80-03-segmentation-1414.ivf.md5
-123d6c0f72ee87911c4ae7538e87b7d163b22d6c  vp80-03-segmentation-1415.ivf.md5
-e70551d1a38920e097a5d8782390b79ecaeb7505  vp80-03-segmentation-1425.ivf.md5
-44e8f4117e46dbb302b2cfd81171cc1a1846e431  vp80-03-segmentation-1426.ivf.md5
-52636e54aee5f95bbace37021bd67de5db767e9a  vp80-03-segmentation-1427.ivf.md5
-b1ad3eff20215c28e295b15ef3636ed926d59cba  vp80-03-segmentation-1432.ivf.md5
-24c22a552fa28a90e5978f67f57181cc2d7546d7  vp80-03-segmentation-1435.ivf.md5
-96c49c390abfced18a7a8c9b9ea10af778e10edb  vp80-03-segmentation-1436.ivf.md5
-f95eb6214571434f1f73ab7833b9ccdf47588020  vp80-03-segmentation-1437.ivf.md5
-1c0700ca27c9b0090a7747a4b0b4dc21d1843181  vp80-03-segmentation-1441.ivf.md5
-81d4f23ca32667ee958bae579c8f5e97ba72eb97  vp80-03-segmentation-1442.ivf.md5
-272efcef07a3a30fbca51bfd566063d8258ec0be  vp80-04-partitions-1404.ivf.md5
-66ed219ab812ac801b256d35cf495d193d4cf478  vp80-04-partitions-1405.ivf.md5
-36083f37f56f502bd60ec5e07502ee9e6b8699b0  vp80-04-partitions-1406.ivf.md5
-6ca909bf168a64c09415626294665dc1be3d1973  vp80-05-sharpness-1428.ivf.md5
-1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a  vp80-05-sharpness-1429.ivf.md5
-71bcbe5357d36a19df5b07fbe3e27bffa8893f0a  vp80-05-sharpness-1430.ivf.md5
-89a09b1dffce2d55770a89e58d9925c70ef79bf8  vp80-05-sharpness-1431.ivf.md5
-08444a18b4e6ba3450c0796dd728d48c399a2dc9  vp80-05-sharpness-1433.ivf.md5
-6d6223719a90c13e848aa2a8a6642098cdb5977a  vp80-05-sharpness-1434.ivf.md5
-41d70bb5fa45bc88da1604a0af466930b8dd77b5  vp80-05-sharpness-1438.ivf.md5
-086c56378df81b6cee264d7540a7b8f2b405c7a4  vp80-05-sharpness-1439.ivf.md5
-d32dc2c4165eb266ea4c23c14a45459b363def32  vp80-05-sharpness-1440.ivf.md5
-8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
-d6f246df012c241b5fa6c1345019a3703d85c419  vp80-06-smallsize.ivf.md5
-ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0  vp90-2-00-quantizer-00.webm
-ac5eda33407d0521c7afca43a63fd305c0cd9d13  vp90-2-00-quantizer-00.webm.md5
-2ca0463f2cfb93d25d7dded174db70b7cb87cb48  vp90-2-00-quantizer-01.webm
-10d98884fc6d9a5f47a2057922b8e25dd48d7786  vp90-2-00-quantizer-01.webm.md5
-d80a2920a5e0819d69dcba8fe260c01f820f8982  vp90-2-00-quantizer-02.webm
-c964c8e5e04165fabbf1c6ee8ee5121d35921965  vp90-2-00-quantizer-02.webm.md5
-fdef046777b5b75c962b715d809dbe2ea331afb9  vp90-2-00-quantizer-03.webm
-f270bee0b0c7aa2bf4c5afe098556b4f3f890faf  vp90-2-00-quantizer-03.webm.md5
-66d98609e809394a6ac730787e6724e3badc075a  vp90-2-00-quantizer-04.webm
-427433bfe121c4aea1095ec3124fdc174d200e3a  vp90-2-00-quantizer-04.webm.md5
-e6e42626d8cadf0b5be16313f69212981b96fee5  vp90-2-00-quantizer-05.webm
-c98f6a9a1af4cfd71416792827304266aad4bd46  vp90-2-00-quantizer-05.webm.md5
-413ef09b721f5dcec1a96e937a97e5873c2e6db6  vp90-2-00-quantizer-06.webm
-5080e940a23805c82e578e21b57fc2c511e76376  vp90-2-00-quantizer-06.webm.md5
-4a50a5f4ac717c30dfaae8bb46702e3542e867de  vp90-2-00-quantizer-07.webm
-76c429a02b56762e10ee4db88729d8834b3a70f4  vp90-2-00-quantizer-07.webm.md5
-d2f4e464780bf8b7e647efa18ac777a930e62bc0  vp90-2-00-quantizer-08.webm
-ab94aabf9316111b52d7c531962ed4123313b6ba  vp90-2-00-quantizer-08.webm.md5
-174bc58433936dd79550398d744f1072ce7f5693  vp90-2-00-quantizer-09.webm
-e1f7690cd83ccc56d045e17cce552544a5f03810  vp90-2-00-quantizer-09.webm.md5
-52bc1dfd3a97b24d922eb8a31d07527891561f2a  vp90-2-00-quantizer-10.webm
-9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8  vp90-2-00-quantizer-10.webm.md5
-10031eecafde1e1d8e6323fe2b2a1d7e77a66869  vp90-2-00-quantizer-11.webm
-fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0  vp90-2-00-quantizer-11.webm.md5
-78e9f7bb77e8e348155bbdfa12790789d1d50c34  vp90-2-00-quantizer-12.webm
-0961d060cc8dd469c6dac8d7d75f927c0bb971b8  vp90-2-00-quantizer-12.webm.md5
-133b77a3bbcef652552d74ffc46afbfe3b8a1cba  vp90-2-00-quantizer-13.webm
-df29e5e0f95772af482f540d776f6b9dea4bfa29  vp90-2-00-quantizer-13.webm.md5
-27323afdaf8987e025c27129c74c86502315a206  vp90-2-00-quantizer-14.webm
-ce96a2cc312942f0427a463f15a392870dd69764  vp90-2-00-quantizer-14.webm.md5
-ab58d0b41037829f6bc993910999f4af0212aafd  vp90-2-00-quantizer-15.webm
-40f700db606501aa7cb49049624cbdde6409b122  vp90-2-00-quantizer-15.webm.md5
-cd948e66448aafb65998815ce37241f95d7c9ee7  vp90-2-00-quantizer-16.webm
-039b742d149c945ed79c7b9a6384352852a1c116  vp90-2-00-quantizer-16.webm.md5
-62f56e663e13c576764e491cf08f19bd46a71999  vp90-2-00-quantizer-17.webm
-90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e  vp90-2-00-quantizer-17.webm.md5
-f26ecad7263cd66a614e53ba5d7c00df181affeb  vp90-2-00-quantizer-18.webm
-cda0a1c0fca2ec2976ae55124a8a67305508bae6  vp90-2-00-quantizer-18.webm.md5
-94bfc4c04fcfe139a63b98c569e8c14ba98c401f  vp90-2-00-quantizer-19.webm
-5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f  vp90-2-00-quantizer-19.webm.md5
-0ee88e9318985e1e245de78c2c4a665885ab76a7  vp90-2-00-quantizer-20.webm
-4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063  vp90-2-00-quantizer-20.webm.md5
-6a995cb2b1db33da8087321df1e646f95c3e32d1  vp90-2-00-quantizer-21.webm
-e216b4a1eceac03efcc433759be54ab8ea87b24b  vp90-2-00-quantizer-21.webm.md5
-aa7722fc427e7180115f3c9cd96bb6b2768e7296  vp90-2-00-quantizer-22.webm
-1aa813bd45ae831bf5e79ace4d73dfd25989a07d  vp90-2-00-quantizer-22.webm.md5
-7677e5b929ed6d142041f19b8a9cd5822ee1504a  vp90-2-00-quantizer-23.webm
-0de0af34abd843d5b37e58baf3ed96a6104b64c3  vp90-2-00-quantizer-23.webm.md5
-b2995cbe1128b2d4926f1b28d01c501ecb6be8c8  vp90-2-00-quantizer-24.webm
-db6033af2ba2f2bca62468fb4b8808e474f93923  vp90-2-00-quantizer-24.webm.md5
-8135ba35587fd92cd4667be7896323d9b634401c  vp90-2-00-quantizer-25.webm
-3499e00c2cc15876f61f07e3d3cfca54ebcd98fd  vp90-2-00-quantizer-25.webm.md5
-af0fa2907746db82d345f6d831fcc1b2862a29fb  vp90-2-00-quantizer-26.webm
-cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7  vp90-2-00-quantizer-26.webm.md5
-bd0002e91323776beb5ff11e06edcf19fc08e9b9  vp90-2-00-quantizer-27.webm
-fe72154ef196067d6c272521012dd79706496cac  vp90-2-00-quantizer-27.webm.md5
-fc15eb606f81455ff03df16bf3432296b002c43c  vp90-2-00-quantizer-28.webm
-40b2e24b542206a6bfd746ef199e49ccea07678a  vp90-2-00-quantizer-28.webm.md5
-3090bbf913cad0b2eddca7228f5ed51a58378b8d  vp90-2-00-quantizer-29.webm
-eb59745e0912d8ed6c928268bcf265237c9ba93f  vp90-2-00-quantizer-29.webm.md5
-c615abdca9c25e1cb110d908edbedfb3b7c92b91  vp90-2-00-quantizer-30.webm
-ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0  vp90-2-00-quantizer-30.webm.md5
-037d9f242086cfb085518f6416259defa82d5fc2  vp90-2-00-quantizer-31.webm
-4654b40792572f0a790874c6347ef9196d86c1a7  vp90-2-00-quantizer-31.webm.md5
-505899f3f3515044c5c8b3213d9b9d16f614619d  vp90-2-00-quantizer-32.webm
-659a2e6dd02df323f62600626859006640b445df  vp90-2-00-quantizer-32.webm.md5
-8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc  vp90-2-00-quantizer-33.webm
-5b175ef1120ddeba4feae1247bf381bbc4e816ce  vp90-2-00-quantizer-33.webm.md5
-4d283755d17e287b1d099a80604398f60d7fb6ea  vp90-2-00-quantizer-34.webm
-22a739de95acfeb27524e3700b8f678a9ad744d8  vp90-2-00-quantizer-34.webm.md5
-4296f56a892a412d3d4f64824718dd566c4e6459  vp90-2-00-quantizer-35.webm
-c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7  vp90-2-00-quantizer-35.webm.md5
-6f54e11da461e4410dd9075b015e2d9bc1d07dfb  vp90-2-00-quantizer-36.webm
-0b3573f5addea4e3eb11a0b85f068299d5bdad78  vp90-2-00-quantizer-36.webm.md5
-210581682a26c2c4375efc785c36e07539888bc2  vp90-2-00-quantizer-37.webm
-2b4fb6f8ba975237858e61cc8f560bcfc87cb38e  vp90-2-00-quantizer-37.webm.md5
-a15ef31283dfc4860f837fe200eb32a445f59629  vp90-2-00-quantizer-38.webm
-fb76771f3a795054b9936f70da7505c3ac585284  vp90-2-00-quantizer-38.webm.md5
-1df8433a441412831daae6726df89fa70d21b14d  vp90-2-00-quantizer-39.webm
-39e162c09a20e7e684868097766347014371fee6  vp90-2-00-quantizer-39.webm.md5
-5330e4788ab9129dbb25a7a7d5411104521248b6  vp90-2-00-quantizer-40.webm
-872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1  vp90-2-00-quantizer-40.webm.md5
-d88d03b982889e399a78d7a06eeb1cf30e6c2da2  vp90-2-00-quantizer-41.webm
-5b4f7217e57fa2a221011d0b32f8d0409496b7b6  vp90-2-00-quantizer-41.webm.md5
-9e16406e3e26955a6e17d455ef1ef64bbfa26e53  vp90-2-00-quantizer-42.webm
-0219d090cf37daabe19256ba8e932ba4874b92e4  vp90-2-00-quantizer-42.webm.md5
-a9b15843486fb05f8cd15437ef279782a42b75db  vp90-2-00-quantizer-43.webm
-3c9b0b4c607f9579a31726bfcf56729334ddc686  vp90-2-00-quantizer-43.webm.md5
-1dbc931ac446c91eabe7213efff55b596cccf07c  vp90-2-00-quantizer-44.webm
-73bc8f675103abaef3d9f73a2742b3bffd726d23  vp90-2-00-quantizer-44.webm.md5
-7c6c1be15beb9d6201204b018966c8c4f9777efc  vp90-2-00-quantizer-45.webm
-c907b29da821f790c6748de61f592689312e4e36  vp90-2-00-quantizer-45.webm.md5
-07b434da1a467580f73b32177ee11b3e00f65a0d  vp90-2-00-quantizer-46.webm
-7b2b7ce60c50bc970bc0ada46d7a7ce440148da3  vp90-2-00-quantizer-46.webm.md5
-233d0465fb1a6fa36e9f89bd2193ac79bd4d2809  vp90-2-00-quantizer-47.webm
-527e0a9fb932efe915027ffe077f9e8d3a4fb139  vp90-2-00-quantizer-47.webm.md5
-719613df7307e205c3fdb6acfb373849c5ab23c7  vp90-2-00-quantizer-48.webm
-65ab6c9d1b682c183b201c7ff42b90343ce3e304  vp90-2-00-quantizer-48.webm.md5
-3bf04a598325ed0eabae1598ec7f718f715ec672  vp90-2-00-quantizer-49.webm
-ac68c4387ce11fcc998d8ba455ab9b2bb361d240  vp90-2-00-quantizer-49.webm.md5
-d59238fb3a654931c9b65a11e7321b40d1f702e9  vp90-2-00-quantizer-50.webm
-d0576bfede46fd55659f028f2fd28554ceb3e6cc  vp90-2-00-quantizer-50.webm.md5
-3f579785101d4209360dd96f8c2ffe9beddf3bee  vp90-2-00-quantizer-51.webm
-89fcfe04f4457a7f02ab4a2f94aacbb88aee5789  vp90-2-00-quantizer-51.webm.md5
-28be5836e2fedefe4babf12fc9b79e460ab0a0f4  vp90-2-00-quantizer-52.webm
-f3dd52b70c18345fee740220f35da9c4def2017a  vp90-2-00-quantizer-52.webm.md5
-488ad4058c17170665b6acd1021fade9a02771e4  vp90-2-00-quantizer-53.webm
-1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7  vp90-2-00-quantizer-53.webm.md5
-682978289cb28cc8c9d39bc797300e45d6039de7  vp90-2-00-quantizer-54.webm
-36c35353f2c03cb099bd710d9994de7d9ed88834  vp90-2-00-quantizer-54.webm.md5
-c398ce49af762a48f10cc4da9fae0769aae5f226  vp90-2-00-quantizer-55.webm
-2cf3570542d984f167ab087f59493c7fb47e0ed2  vp90-2-00-quantizer-55.webm.md5
-3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3  vp90-2-00-quantizer-56.webm
-d3f93f8272b6de31cffb011a26f11abb514efb12  vp90-2-00-quantizer-56.webm.md5
-f4e8e14b1f278801a7eb6f11734780a01b1668e9  vp90-2-00-quantizer-57.webm
-6478fdf1d7faf6db5f19dffc5e1363af358699ee  vp90-2-00-quantizer-57.webm.md5
-307dc264f57cc618fff211fa44d7f52767ed9660  vp90-2-00-quantizer-58.webm
-cf231d4a52d492fa692ea4194ec5eb7511fec54e  vp90-2-00-quantizer-58.webm.md5
-1fd7cd596170afce2de0b1441b7674bda5723440  vp90-2-00-quantizer-59.webm
-4681f7ef96f63e085c41bb1a964b0df7e67e0b38  vp90-2-00-quantizer-59.webm.md5
-34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62  vp90-2-00-quantizer-60.webm
-58691ef53b6b623810e2c57ded374c77535df935  vp90-2-00-quantizer-60.webm.md5
-e6e812406aab81021bb16e772c1db03f75906cb6  vp90-2-00-quantizer-61.webm
-76436eace62f08ff92b61a0845e66667a027db1b  vp90-2-00-quantizer-61.webm.md5
-84d811bceed70c950a6a08e572a6e274866e72b1  vp90-2-00-quantizer-62.webm
-2d937cc011eeddd95222b960982da5cd18db580f  vp90-2-00-quantizer-62.webm.md5
-0912b295ba0ea09359315315ffd67d22d046f883  vp90-2-00-quantizer-63.webm
-5a829031055d70565f57dbcd47a6ac33619952b3  vp90-2-00-quantizer-63.webm.md5
-0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00  vp90-2-01-sharpness-1.webm
-5a0476be4448bae8f8ca17ea236c98793a755948  vp90-2-01-sharpness-1.webm.md5
-51e02d7911810cdf5be8b68ac40aedab479a3179  vp90-2-01-sharpness-2.webm
-a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6  vp90-2-01-sharpness-2.webm.md5
-0603f8ad239c07a531d948187f4dafcaf51eda8d  vp90-2-01-sharpness-3.webm
-3af8000a69c72fe77881e3176f026c2affb78cc7  vp90-2-01-sharpness-3.webm.md5
-4ca4839f48146252fb261ed88838d80211804841  vp90-2-01-sharpness-4.webm
-08832a1494f84fa9edd40e080bcf2c0e80100c76  vp90-2-01-sharpness-4.webm.md5
-95099dc8f9cbaf9b9a7dd65311923e441ff70731  vp90-2-01-sharpness-5.webm
-93ceee30c140f0b406726c0d896b9db6031c4c7f  vp90-2-01-sharpness-5.webm.md5
-ceb4116fb7b078d266d153233b6d62a255a34e4c  vp90-2-01-sharpness-6.webm
-da83efe59e537ce538e8b03a6eac63cf25849c9a  vp90-2-01-sharpness-6.webm.md5
-b5f7cd19aece3880f9d616a778e5cc24c6b9b505  vp90-2-01-sharpness-7.webm
-2957408d20deac8633941a2169f801bae6f086e1  vp90-2-01-sharpness-7.webm.md5
-ffc096c2ce1050450ad462b5fabd2a5220846319  vp90-2-02-size-08x08.webm
-e36d2ed6fa2746347710b750586aafa6a01ff3ae  vp90-2-02-size-08x08.webm.md5
-895b986f9fd55cd879472b31c6a06b82094418c8  vp90-2-02-size-08x10.webm
-079157a19137ccaebba606f2871f45a397347150  vp90-2-02-size-08x10.webm.md5
-1c5992203e62a2b83040ccbecd748b604e19f4c0  vp90-2-02-size-08x16.webm
-9aa45ffdf2078f883bbed01450031b691819c144  vp90-2-02-size-08x16.webm.md5
-d0a8953da1f85f484487408fee5da9e2a8391901  vp90-2-02-size-08x18.webm
-59a5cc17d354c6a23e5e959d666b1456a5d49c56  vp90-2-02-size-08x18.webm.md5
-1b13461a9fc65cb041bacfe4ea6f02d363397d61  vp90-2-02-size-08x32.webm
-2bdddd6878f05d37d84cde056a3f5e7f926ba3d6  vp90-2-02-size-08x32.webm.md5
-2861f0a0daadb62295b0504a1fbe5b50c79a8f59  vp90-2-02-size-08x34.webm
-6b5812cfb8a82d378ea2913bf009e93668020147  vp90-2-02-size-08x34.webm.md5
-02f948216d4246579dc53c47fe55d8fb264ba251  vp90-2-02-size-08x64.webm
-84b55fdee6d9aa820c7a8c62822446184b191767  vp90-2-02-size-08x64.webm.md5
-4b011242cbf42516efd2b197baebb61dd34562c9  vp90-2-02-size-08x66.webm
-6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b  vp90-2-02-size-08x66.webm.md5
-4057796be9dd12df48ab607f502ae6aa70eeeab6  vp90-2-02-size-10x08.webm
-71c752c51aec9f48de286b93f4c20e9c11cad7d0  vp90-2-02-size-10x08.webm.md5
-6583c853fa43fc53d51743eac5f3a43a359d45d0  vp90-2-02-size-10x10.webm
-1da524d24af1944b671d4d3f2b398d6e336584c3  vp90-2-02-size-10x10.webm.md5
-ba442fc03ccd3a705c64c83b36f5ada67d198874  vp90-2-02-size-10x16.webm
-7cfd960f232c34c641a4a2a9411b6fd0efb2fc50  vp90-2-02-size-10x16.webm.md5
-cc92ed40eef14f52e4d080cb2c57939dd8326374  vp90-2-02-size-10x18.webm
-db5626275cc55ce970b91c995e74f6838d943aca  vp90-2-02-size-10x18.webm.md5
-3a93d501d22325e9fd4c9d8b82e2a432de33c351  vp90-2-02-size-10x32.webm
-5cae51b0c71cfc131651f345f87583eb2903afaf  vp90-2-02-size-10x32.webm.md5
-50d2f2b15a9a5178153db44a9e03aaf32b227f67  vp90-2-02-size-10x34.webm
-bb0efe058122641e7f73e94497dda2b9e6c21efd  vp90-2-02-size-10x34.webm.md5
-01624ec173e533e0b33fd9bdb91eb7360c7c9175  vp90-2-02-size-10x64.webm
-b9c0e3b054463546356acf5157f9be92fd34732f  vp90-2-02-size-10x64.webm.md5
-2942879baf1c09e96b14d0fc84806abfe129c706  vp90-2-02-size-10x66.webm
-bab5f539c2f91952e187456b4beafbb4c01e25ee  vp90-2-02-size-10x66.webm.md5
-88d2b63ca5e9ee163d8f20e8886f3df3ff301a66  vp90-2-02-size-16x08.webm
-7f48a0fcf8c25963f3057d7f6669c5f2415834b8  vp90-2-02-size-16x08.webm.md5
-59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f  vp90-2-02-size-16x10.webm
-73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81  vp90-2-02-size-16x10.webm.md5
-066834fef9cf5b9a72932cf4dea5f253e14a976d  vp90-2-02-size-16x16.webm
-faec542f52f37601cb9c480d887ae9355be99372  vp90-2-02-size-16x16.webm.md5
-195307b4eb3192271ee4a935b0e48deef0c54cc2  vp90-2-02-size-16x18.webm
-5a92e19e624c0376321d4d0e22c0c91995bc23e1  vp90-2-02-size-16x18.webm.md5
-14f3f884216d7ae16ec521f024a2f2d31bbf9c1a  vp90-2-02-size-16x32.webm
-ea622d1c817dd174556f7ee7ccfe4942b34d4845  vp90-2-02-size-16x32.webm.md5
-2e0501100578a5da9dd47e4beea160f945bdd1ba  vp90-2-02-size-16x34.webm
-1b8645ef64239334921c5f56b24ce815e6070b05  vp90-2-02-size-16x34.webm.md5
-89a6797fbebebe93215f367229a9152277f5dcfe  vp90-2-02-size-16x64.webm
-a03d8c1179ca626a8856fb416d635dbf377979cd  vp90-2-02-size-16x64.webm.md5
-0f3a182e0750fcbae0b9eae80c7a53aabafdd18d  vp90-2-02-size-16x66.webm
-8cb6736dc2d897c1283919a32068af377d66c59c  vp90-2-02-size-16x66.webm.md5
-68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1  vp90-2-02-size-18x08.webm
-874c7fb505be9db3160c57cb405c4dbd5b990dc2  vp90-2-02-size-18x08.webm.md5
-0546352dd78496d4dd86c3727ac2ff36c9e72032  vp90-2-02-size-18x10.webm
-1d80eb36557ea5f25a386495a36f93da0f25316b  vp90-2-02-size-18x10.webm.md5
-60fe99e5f5cc99706efa3e0b894e45cbcf0d6330  vp90-2-02-size-18x16.webm
-1ab6cdd89a53662995d103546e6611c84f9292ab  vp90-2-02-size-18x16.webm.md5
-f9a8f5fb749d69fd555db6ca093b7f77800c7b4f  vp90-2-02-size-18x18.webm
-ace8a66328f7802b15f9989c2720c029c6abd279  vp90-2-02-size-18x18.webm.md5
-a197123a527ec25913a9bf52dc8c347749e00045  vp90-2-02-size-18x32.webm
-34fbd7036752232d1663e70d7f7cdc93f7129202  vp90-2-02-size-18x32.webm.md5
-f219655a639a774a2c9c0a9f45c28dc0b5e75e24  vp90-2-02-size-18x34.webm
-2c4d622a9ea548791c1a07903d3702e9774388bb  vp90-2-02-size-18x34.webm.md5
-5308578da48c677d477a5404e19391d1303033c9  vp90-2-02-size-18x64.webm
-e7fd4462527bac38559518ba80e41847db880f15  vp90-2-02-size-18x64.webm.md5
-e109a7e013bd179f97e378542e1e81689ed06802  vp90-2-02-size-18x66.webm
-45c04e422fb383c1f3be04beefaa4490e83bdb1a  vp90-2-02-size-18x66.webm.md5
-38844cae5d99caf445f7de33c3ae78494ce36c01  vp90-2-02-size-32x08.webm
-ad018be39e493ca2405225034b1a5b7a42af6f3a  vp90-2-02-size-32x08.webm.md5
-7b57eaad55906f9de9903c8657a3fcb2aaf792ea  vp90-2-02-size-32x10.webm
-2294425d4e55d275af5e25a0beac9738a1b4ee73  vp90-2-02-size-32x10.webm.md5
-f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1  vp90-2-02-size-32x16.webm
-ae10981d93913f0ab1f28c1146255e01769aa8c0  vp90-2-02-size-32x16.webm.md5
-08b23ad838b6cf1fbfe3ad7e7775d95573e815fc  vp90-2-02-size-32x18.webm
-1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8  vp90-2-02-size-32x18.webm.md5
-d5b88ae6c8c25c53dee74d9f1e6ca64244349a57  vp90-2-02-size-32x32.webm
-e39c067a8ee2da52a51641eb1cb7f8eba935eb6b  vp90-2-02-size-32x32.webm.md5
-529429920dc36bd899059fa75a767f02c8c60874  vp90-2-02-size-32x34.webm
-56888e7834f52b106e8911e3a7fc0f473b609995  vp90-2-02-size-32x34.webm.md5
-38e848e160391c2b1a55040aadde613b9f4bf15e  vp90-2-02-size-32x64.webm
-8950485fb3f68b0e8be234db860e4ec5f5490fd0  vp90-2-02-size-32x64.webm.md5
-5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94  vp90-2-02-size-32x66.webm
-225df9d7d72ec711b0b60f4aeb65311c97db054a  vp90-2-02-size-32x66.webm.md5
-695f929e2ce6fb11a1f180322d46c5cb1c97fa61  vp90-2-02-size-34x08.webm
-5bb4262030018dd01883965c6aa6070185924ef6  vp90-2-02-size-34x08.webm.md5
-5adf74ec906d2ad3f7526e06bd29f5ad7d966a90  vp90-2-02-size-34x10.webm
-71c100b437d3e8701632ae8d65c3555339b1c68f  vp90-2-02-size-34x10.webm.md5
-d0918923c987fba2d00193d83797b21289fe54aa  vp90-2-02-size-34x16.webm
-5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d  vp90-2-02-size-34x16.webm.md5
-553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd  vp90-2-02-size-34x18.webm
-a164c7f3c424987df2340496e6a8cf76e973f0f1  vp90-2-02-size-34x18.webm.md5
-baf3e233634f150de81c18ba5d8848068e1c3c54  vp90-2-02-size-34x32.webm
-22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03  vp90-2-02-size-34x32.webm.md5
-6d50a533774a7167350e4a7ef43c94a5622179a2  vp90-2-02-size-34x34.webm
-0c099638e79c273546523e06704553e42eb00b00  vp90-2-02-size-34x34.webm.md5
-698cdd0a5e895cc202c488675e682a8c537ede4f  vp90-2-02-size-34x64.webm
-9317b63987cddab8389510a27b86f9f3d46e3fa5  vp90-2-02-size-34x64.webm.md5
-4b5335ca06f082b6b69f584eb8e7886bdcafefd3  vp90-2-02-size-34x66.webm
-e18d68b35428f46a84a947c646804a51ef1d7cec  vp90-2-02-size-34x66.webm.md5
-a54ae7b494906ec928a876e8290e5574f2f9f6a2  vp90-2-02-size-64x08.webm
-87f9f7087b6489d45e9e4b38ede2c5aef4a4928f  vp90-2-02-size-64x08.webm.md5
-24522c70804a3c23d937df2d829ae63965b23f38  vp90-2-02-size-64x10.webm
-447ce03938ab53bffcb4a841ee0bfaa90462dcb9  vp90-2-02-size-64x10.webm.md5
-2a5035d035d214ae614af8051930690ef623989b  vp90-2-02-size-64x16.webm
-84e355761dd2e0361b904c84c52a0dd0384d89cf  vp90-2-02-size-64x16.webm.md5
-3a293ef4e270a19438e59b817fbe5f43eed4d36b  vp90-2-02-size-64x18.webm
-666824e5ba746779eb46079e0631853dcc86d48b  vp90-2-02-size-64x18.webm.md5
-ed32fae837095c9e8fc95d223ec68101812932c2  vp90-2-02-size-64x32.webm
-97086eadedce1d0d9c072b585ba7b49aec69b1e7  vp90-2-02-size-64x32.webm.md5
-696c7a7250bdfff594f4dfd88af34239092ecd00  vp90-2-02-size-64x34.webm
-253a1d38d452e7826b086846c6f872f829c276bb  vp90-2-02-size-64x34.webm.md5
-fc508e0e3c2e6872c60919a60b812c5232e9c2b0  vp90-2-02-size-64x64.webm
-2cd6ebeca0f82e9f505616825c07950371b905ab  vp90-2-02-size-64x64.webm.md5
-0f8a4fc1d6521187660425c283f08dff8c66e476  vp90-2-02-size-64x66.webm
-5806be11a1d346be235f88d3683e69f73746166c  vp90-2-02-size-64x66.webm.md5
-273b0c36e3658685cde250408a478116d7ae92f1  vp90-2-02-size-66x08.webm
-23c3cd0dca20a2f71f036e77ea92025ff4e7a298  vp90-2-02-size-66x08.webm.md5
-4844c59c3306d1e671bb0568f00e344bf797e66e  vp90-2-02-size-66x10.webm
-e041eaf6841d775f8fde8bbb4949d2733fdaab7f  vp90-2-02-size-66x10.webm.md5
-bdf3f1582b234fcd2805ffec59f9d716a2345302  vp90-2-02-size-66x16.webm
-2ec85ee18119e6798968571ea6e1b93ca386e3af  vp90-2-02-size-66x16.webm.md5
-0acce9af12b13b025d5274013da7ef6f568f075f  vp90-2-02-size-66x18.webm
-77c4d53e2a5c96b70af9d575fe6811e0f5ee627b  vp90-2-02-size-66x18.webm.md5
-682b36a25774bbdedcd603f504d18eb63f0167d4  vp90-2-02-size-66x32.webm
-53728fae2a428f16d376a29f341a64ddca97996a  vp90-2-02-size-66x32.webm.md5
-e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6  vp90-2-02-size-66x34.webm
-f69a6a555e3f614b0a35f9bfc313d8ebb35bc725  vp90-2-02-size-66x34.webm.md5
-4151b8c29452d5c2266397a7b9bf688899a2937b  vp90-2-02-size-66x64.webm
-69486e7fd9e380b6c97a03d3e167affc79f73840  vp90-2-02-size-66x64.webm.md5
-68784a1ecac776fe2a3f230345af32f06f123536  vp90-2-02-size-66x66.webm
-7f008c7f48d55e652fbd6bac405b51e0015c94f2  vp90-2-02-size-66x66.webm.md5
-7e1bc449231ac1c5c2a11c9a6333b3e828763798  vp90-2-03-size-196x196.webm
-6788a561466dace32d500194bf042e19cccc35e1  vp90-2-03-size-196x196.webm.md5
-a170c9a88ec1dd854c7a471ff55fb2a97ac31870  vp90-2-03-size-196x198.webm
-6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2  vp90-2-03-size-196x198.webm.md5
-68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f  vp90-2-03-size-196x200.webm
-bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e  vp90-2-03-size-196x200.webm.md5
-fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1  vp90-2-03-size-196x202.webm
-158ee72af578f39aad0c3b8f4cbed2fc78b57e0f  vp90-2-03-size-196x202.webm.md5
-dd28fb7247af534bdf5e6795a3ac429610489a0b  vp90-2-03-size-196x208.webm
-7546be847efce2d1c0a23f807bfb03f91b764e1e  vp90-2-03-size-196x208.webm.md5
-41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8  vp90-2-03-size-196x210.webm
-9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1  vp90-2-03-size-196x210.webm.md5
-5007bc618143437c009d6dde5fc2e86f72d37dc2  vp90-2-03-size-196x224.webm
-858361d8f79b44df5545feabbc9754ec9ede632f  vp90-2-03-size-196x224.webm.md5
-0bcbe357fbc776c3fa68e7117179574ed7564a44  vp90-2-03-size-196x226.webm
-72006a5f42031a43d70a2cd9fc1958962a86628f  vp90-2-03-size-196x226.webm.md5
-000239f048cceaac055558e97ef07078ebf65502  vp90-2-03-size-198x196.webm
-2d6841901b72000c5340f30be602853438c1b787  vp90-2-03-size-198x196.webm.md5
-ae75b766306a6404c3b3b35a6b6d53633c14fbdb  vp90-2-03-size-198x198.webm
-3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1  vp90-2-03-size-198x198.webm.md5
-95ffd573fa84ccef1cd59e1583e6054f56a5c83d  vp90-2-03-size-198x200.webm
-5d537e3c9b9c54418c79677543454c4cda3de1af  vp90-2-03-size-198x200.webm.md5
-ecc845bf574375f469bc91bf5c75c79dc00073d6  vp90-2-03-size-198x202.webm
-1b59f5e111265615a7a459eeda8cc9045178d228  vp90-2-03-size-198x202.webm.md5
-432fb27144fe421b9f51cf44d2750a26133ed585  vp90-2-03-size-198x208.webm
-a58a67f4fb357c73ca078aeecbc0f782975630b1  vp90-2-03-size-198x208.webm.md5
-ff5058e7e6a47435046612afc8536f2040989e6f  vp90-2-03-size-198x210.webm
-18d3be7935e52217e2e9400b6f2c681a9e45dc89  vp90-2-03-size-198x210.webm.md5
-a0d55263c1ed2c03817454dd4ec4090d36dbc864  vp90-2-03-size-198x224.webm
-efa366a299817e2da51c00623b165aab9fbb8d91  vp90-2-03-size-198x224.webm.md5
-ccd142fa2920fc85bb753f049160c1c353ad1574  vp90-2-03-size-198x226.webm
-534524a0b2dbff852e0b92ef09939db072f83243  vp90-2-03-size-198x226.webm.md5
-0d483b94ed40abc8ab6e49f960432ee54ad9c7f1  vp90-2-03-size-200x196.webm
-41795f548181717906e7a504ba551f06c32102ae  vp90-2-03-size-200x196.webm.md5
-f6c2dc54e0989d50f01333fe40c91661fcbf849a  vp90-2-03-size-200x198.webm
-43df5d8c46a40089441392e6d096c588c1079a68  vp90-2-03-size-200x198.webm.md5
-2f6e9df82e44fc145f0d9212dcccbed3de605e23  vp90-2-03-size-200x200.webm
-757b2ef96b82093255725bab9690bbafe27f3caf  vp90-2-03-size-200x200.webm.md5
-40c5ea60415642a4a2e75c0d127b06309baadfab  vp90-2-03-size-200x202.webm
-3022c4a1c625b5dc04fdb1052d17d45b4171cfba  vp90-2-03-size-200x202.webm.md5
-6942ed5b27476bb8506d10e600d6ff60887780ca  vp90-2-03-size-200x208.webm
-c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be  vp90-2-03-size-200x208.webm.md5
-71dbc99b83c49d1da45589b91eabb98e2f4a7b1e  vp90-2-03-size-200x210.webm
-3f0b40da7eef7974b9bc326562f251feb67d9c7c  vp90-2-03-size-200x210.webm.md5
-6b6b8489081cfefb377cc5f18eb754ec2383f655  vp90-2-03-size-200x224.webm
-a259df2ac0e294492e3f9d4315baa34cab044f04  vp90-2-03-size-200x224.webm.md5
-c9adc1c9bb07559349a0b054df4af56f7a6edbb9  vp90-2-03-size-200x226.webm
-714cec61e3575581e4f1a0e3921f4dfdbbd316c5  vp90-2-03-size-200x226.webm.md5
-f9bdc936bdf53f8be9ce78fecd41a21d31ff3943  vp90-2-03-size-202x196.webm
-5b8e2e50fcea2c43b12fc067b8a9cc117af77bda  vp90-2-03-size-202x196.webm.md5
-c7b66ea3da87613deb47ff24a111247d3c384fec  vp90-2-03-size-202x198.webm
-517e91204b25586da943556f4adc5951c9be8bee  vp90-2-03-size-202x198.webm.md5
-935ef56b01cfdb4265a7e24696645209ccb20970  vp90-2-03-size-202x200.webm
-55b8ec4a2513183144a8e27564596c06c7576fce  vp90-2-03-size-202x200.webm.md5
-849acf75e4f1d8d90046704e1103a18c64f30e35  vp90-2-03-size-202x202.webm
-c79afc6660df2824e7df314e5bfd71f0d8acf76b  vp90-2-03-size-202x202.webm.md5
-17b3a4d55576b770626ccb856b9f1a6c8f6ae476  vp90-2-03-size-202x208.webm
-0b887ff30409c58f2ccdc3bfacd6be7c69f8997a  vp90-2-03-size-202x208.webm.md5
-032d0ade4230fb2eef6d19915a7a1c9aa4a52617  vp90-2-03-size-202x210.webm
-f78f8e79533c0c88dd2bfdcec9b1c07848568ece  vp90-2-03-size-202x210.webm.md5
-915a38c31fe425d5b93c837121cfa8082f5ea5bc  vp90-2-03-size-202x224.webm
-bf52a104074d0c5942aa7a5b31e11db47e43d48e  vp90-2-03-size-202x224.webm.md5
-be5cfde35666fa435e47d544d9258215beb1cf29  vp90-2-03-size-202x226.webm
-2fa2f87502fda756b319389c8975204e130a2e3f  vp90-2-03-size-202x226.webm.md5
-15d908e97862b5b4bf295610df011fb9aa09909b  vp90-2-03-size-208x196.webm
-50c60792305d6a99be376dd596a6ff979325e6cc  vp90-2-03-size-208x196.webm.md5
-a367c7bc9fde56d6f4848cc573c7d4c1ce75e348  vp90-2-03-size-208x198.webm
-be85fb2c8d435a75484231356f07d06ebddd13cd  vp90-2-03-size-208x198.webm.md5
-05fd46deb7288e7253742091f56e54a9a441a187  vp90-2-03-size-208x200.webm
-74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c  vp90-2-03-size-208x200.webm.md5
-d8985c4b386513a7385a4b3639bf91e469f1378b  vp90-2-03-size-208x202.webm
-0614a1e8d92048852adcf605a51333f5fabc7f03  vp90-2-03-size-208x202.webm.md5
-28b002242238479165ba4fb87ee6b442c64b32e4  vp90-2-03-size-208x208.webm
-37de5aca59bb900228400b0e115d3229edb9dcc0  vp90-2-03-size-208x208.webm.md5
-c545be0050c2fad7c68427dbf86c62a739e94ab3  vp90-2-03-size-208x210.webm
-d646eccb3cd578f94b54777e32b88898bef6e17a  vp90-2-03-size-208x210.webm.md5
-63a0cfe295b661026dd7b1bebb67acace1db766f  vp90-2-03-size-208x224.webm
-85c0361d93bf85a335248fef2767ff43eeef23db  vp90-2-03-size-208x224.webm.md5
-f911cc718d66e4fe8a865226088939c9eb1b7825  vp90-2-03-size-208x226.webm
-a6d583a57876e7b7ec48625b2b2cdbcf70cab837  vp90-2-03-size-208x226.webm.md5
-5bbb0f36da9a4683cf04e724124d8696332911bf  vp90-2-03-size-210x196.webm
-a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d  vp90-2-03-size-210x196.webm.md5
-8db64d6f9ce36dd382013b42ae4e292deba697bc  vp90-2-03-size-210x198.webm
-eda20f8268c7f4147bead4059e9c4897e09140a9  vp90-2-03-size-210x198.webm.md5
-ce391505eeaf1d12406563101cd6b2dbbbb44bfc  vp90-2-03-size-210x200.webm
-79d73b7f623082d2a00aa33e95c79d11c7d9c3a8  vp90-2-03-size-210x200.webm.md5
-852db6fdc206e72391fc69b807f1954934679949  vp90-2-03-size-210x202.webm
-f69414c5677ed2f2b8b37ae76429e509a92276a5  vp90-2-03-size-210x202.webm.md5
-c424cc3edd2308da7d33f27acb36b54db5bf2595  vp90-2-03-size-210x208.webm
-27b18562faa1b3184256f4eae8114b539b3e9d3e  vp90-2-03-size-210x208.webm.md5
-dd029eba719d50a2851592fa8b9b2efe88904930  vp90-2-03-size-210x210.webm
-c853a1670465eaa04ca31b3511995f1b6ed4f58f  vp90-2-03-size-210x210.webm.md5
-d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3  vp90-2-03-size-210x224.webm
-93b793e79d987065b39ad8e2e71244368435fc25  vp90-2-03-size-210x224.webm.md5
-3d0825fe83bcc125be1f78145ff43ca6d7588784  vp90-2-03-size-210x226.webm
-5230f31a57ca3b5311698a12035d2644533b3ec4  vp90-2-03-size-210x226.webm.md5
-6622f8bd9279e1ce45509a58a31a990052d45e14  vp90-2-03-size-224x196.webm
-65411da07f60113f2be05c807879072b161d561e  vp90-2-03-size-224x196.webm.md5
-6744ff2ee2c41eb08c62ff30880833b6d77b585b  vp90-2-03-size-224x198.webm
-46ea3641d41acd4bff347b224646c060d5620385  vp90-2-03-size-224x198.webm.md5
-8eb91f3416a1404705f370caecd74b2b458351b1  vp90-2-03-size-224x200.webm
-196aefb854c8b95b9330263d6690b7ee15693ecf  vp90-2-03-size-224x200.webm.md5
-256a5a23ef4e6d5ef2871af5afb8cd13d28cec00  vp90-2-03-size-224x202.webm
-840ad8455dcf2be378c14b007e66fa642fc8196d  vp90-2-03-size-224x202.webm.md5
-db4606480ab48b96c9a6ff5e639f1f1aea2a12e4  vp90-2-03-size-224x208.webm
-40b9801d5620467499ac70fa6b7c40aaa5e1c331  vp90-2-03-size-224x208.webm.md5
-e37159e687fe1cb24cffddfae059301adbaf4212  vp90-2-03-size-224x210.webm
-1e4acd4b6334ae260c3eed08652d0ba8122073f2  vp90-2-03-size-224x210.webm.md5
-0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130  vp90-2-03-size-224x224.webm
-37db449ad86fb286c2c02d94aa8fe0379c05044a  vp90-2-03-size-224x224.webm.md5
-32ebbf903a7d7881bcfe59639f1d472371f3bf27  vp90-2-03-size-224x226.webm
-5cc3ac5dc9f6912491aa2ddac863f8187f34c569  vp90-2-03-size-224x226.webm.md5
-9480ff5c2c32b1870ac760c87514912616e6cf01  vp90-2-03-size-226x196.webm
-fe83655c0f1888f0af7b047785f01ba7ca9f1324  vp90-2-03-size-226x196.webm.md5
-09cad4221996315cdddad4e502dbfabf53ca1d6a  vp90-2-03-size-226x198.webm
-e3ddfdc650acb95adb45abd9b634e1f09ea8ac96  vp90-2-03-size-226x198.webm.md5
-c34f49d55fe39e3f0b607e3cc95e30244225cecb  vp90-2-03-size-226x200.webm
-abb83edc868a3523ccd4e5523fac2efbe7c3df1f  vp90-2-03-size-226x200.webm.md5
-d17bc08eedfc60c4c23d576a6c964a21bf854d1f  vp90-2-03-size-226x202.webm
-1d22d2d0f375251c2d5a1acb4714bc35d963865b  vp90-2-03-size-226x202.webm.md5
-9bd537c4f92a25596ccd29fedfe181feac948b92  vp90-2-03-size-226x208.webm
-6feb0e7325386275719f3511ada9e248a2ae7df4  vp90-2-03-size-226x208.webm.md5
-4487067f6cedd495b93696b44b37fe0a3e7eda14  vp90-2-03-size-226x210.webm
-49a8fa87945f47208168d541c068e78d878075d5  vp90-2-03-size-226x210.webm.md5
-559fea2f8da42b33c1aa1dbc34d1d6781009847a  vp90-2-03-size-226x224.webm
-83c6d8f2969b759e10e5c6542baca1265c874c29  vp90-2-03-size-226x224.webm.md5
-fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce  vp90-2-03-size-226x226.webm
-94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
-b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
-65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
-4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba  vp90-2-05-resize.ivf
-7f6d8879336239a43dbb6c9f13178cb11cf7ed09  vp90-2-05-resize.ivf.md5
-bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe  vp90-2-06-bilinear.webm
-f6235f937552e11d8eb331ec55da6b3aa596b9ac  vp90-2-06-bilinear.webm.md5
-0c83a1e414fde3bccd6dc451bbaee68e59974c76  vp90-2-07-frame_parallel.webm
-e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd  vp90-2-07-frame_parallel.webm.md5
-086c7edcffd699ae7d99d710fd7e53b18910ca5b  vp90-2-08-tile_1x2_frame_parallel.webm
-e981ecaabb29a80e0cbc1f4002384965ce8e95bb  vp90-2-08-tile_1x2_frame_parallel.webm.md5
-ed79be026a6f28646c5825da1c12d1fbc70f96a4  vp90-2-08-tile_1x2.webm
-45b404e025841c9750895fc1a9f6bd384fe6a315  vp90-2-08-tile_1x2.webm.md5
-cf8ea970c776797aae71dac8317ea926d9431cab  vp90-2-08-tile_1x4_frame_parallel.webm
-a481fbea465010b57af5a19ebf6d4a5cfe5b9278  vp90-2-08-tile_1x4_frame_parallel.webm.md5
-0203ec456277a01aec401e7fb6c72c9a7e5e3f9d  vp90-2-08-tile_1x4.webm
-c9b237dfcc01c1b414fbcaa481d014a906ef7998  vp90-2-08-tile_1x4.webm.md5
-20c75157e91ab41f82f70ffa73d5d01df8469287  vp90-2-08-tile-4x4.webm
-ae7451810247fd13975cc257aa0301ff17102255  vp90-2-08-tile-4x4.webm.md5
-2ec6e15422ac7a61af072dc5f27fcaf1942ce116  vp90-2-08-tile-4x1.webm
-0094f5ee5e46345017c30e0aa4835b550212d853  vp90-2-08-tile-4x1.webm.md5
-edea45dac4a3c2e5372339f8851d24c9bef803d6  vp90-2-09-subpixel-00.ivf
-5428efc4bf92191faedf4a727fcd1d94966a7abc  vp90-2-09-subpixel-00.ivf.md5
-8cdd435d89029987ee196896e21520e5f879f04d  vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
-091b373aa2ecb59aa5c647affd5bcafcc7547364  vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
-87ee28032b0963a44b73a850fcc816a6dc83efbb  vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
-c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f  vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
-2064bdb22aa71c2691e0469fb62e8087a43f08f8  vp90-2-bbb_426x240_tile_1x1_180kbps.webm
-8080eda22694910162f0996e8a962612f381a57f  vp90-2-bbb_640x360_tile_1x2_337kbps.webm
-a484b335c27ea189c0f0d77babea4a510ce12d50  vp90-2-bbb_854x480_tile_1x2_651kbps.webm
-3eacf1f006250be4cc5c92a7ef146e385ee62653  vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
-217f089a16447490823127b36ce0d945522accfd  vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
-eedb3c641e60dacbe082491a16df529a5c9187df  vp90-2-sintel_426x182_tile_1x1_171kbps.webm
-cb7e4955af183dff33bcba0c837f0922ab066400  vp90-2-sintel_640x272_tile_1x2_318kbps.webm
-48613f9380e2580002f8a09d6e412ea4e89a52b9  vp90-2-sintel_854x364_tile_1x2_621kbps.webm
-990a91f24dd284562d21d714ae773dff5452cad8  vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
-aa402217577a659cfc670157735b4b8e9aa670fe  vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
-b6dd558c90bca466b4bcbd03b3371648186465a7  vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
-1a9c2914ba932a38f0a143efc1ad0e318e78888b  vp90-2-tos_426x178_tile_1x1_181kbps.webm
-a3d2b09f24debad4747a1b3066f572be4273bced  vp90-2-tos_640x266_tile_1x2_336kbps.webm
-c64b03b5c090e6888cb39685c31f00a6b79fa45c  vp90-2-tos_854x356_tile_1x2_656kbps.webm
-94b533dbcf94292001e27cc51fec87f9e8c90c0b  vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
-0e7cd4135b231c9cea8d76c19f9e84b6fd77acec  vp90-2-08-tile_1x8_frame_parallel.webm
-c9b6850af28579b031791066457f4cb40df6e1c7  vp90-2-08-tile_1x8_frame_parallel.webm.md5
-e448b6e83490bca0f8d58b4f4b1126a17baf4b0c  vp90-2-08-tile_1x8.webm
-5e524165f0397e6141d914f4f0a66267d7658376  vp90-2-08-tile_1x8.webm.md5
-a34e14923d6d17b1144254d8187d7f85b700a63c  vp90-2-02-size-lf-1920x1080.webm
-e3b28ddcfaeb37fb4d132b93f92642a9ad17c22d  vp90-2-02-size-lf-1920x1080.webm.md5
-d48c5db1b0f8e60521a7c749696b8067886033a3  vp90-2-09-aq2.webm
-84c1599298aac78f2fc05ae2274575d10569dfa0  vp90-2-09-aq2.webm.md5
-55fc55ed73d578ed60fad05692579873f8bad758  vp90-2-09-lf_deltas.webm
-54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
-510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
-14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
-d2feea7728e8d2c615981d0f47427a4a5a45d881  vp90-2-10-show-existing-frame2.webm
-5f7c7811baa3e4f03be1dd78c33971b727846821  vp90-2-10-show-existing-frame2.webm.md5
-b4318e75f73a6a08992c7326de2fb589c2a794c7  vp90-2-11-size-351x287.webm
-b3c48382cf7d0454e83a02497c229d27720f9e20  vp90-2-11-size-351x287.webm.md5
-8e0096475ea2535bac71d3e2fc09e0c451c444df  vp90-2-11-size-351x288.webm
-19e003804ec1dfc5464813b32339a15d5ba7b42f  vp90-2-11-size-351x288.webm.md5
-40cd1d6a188d7a88b21ebac1e573d3f270ab261e  vp90-2-11-size-352x287.webm
-68f515abe3858fc1eded46c8e6b2f727d43b5331  vp90-2-11-size-352x287.webm.md5
-9a510769ff23db410880ec3029d433e87d17f7fc  vp90-2-12-droppable_1.ivf
-952eaac6eefa6f62179ed1db3e922fd42fecc624  vp90-2-12-droppable_1.ivf.md5
-9a510769ff23db410880ec3029d433e87d17f7fc  vp90-2-12-droppable_2.ivf
-92a756469fa438220524e7fa6ac1d38c89514d17  vp90-2-12-droppable_2.ivf.md5
-c21e97e4ba486520118d78b01a5cb6e6dc33e190  vp90-2-12-droppable_3.ivf
-601abc9e4176c70f82ac0381365e9b151fdd24cd  vp90-2-12-droppable_3.ivf.md5
-61c640dad23cd4f7ad811b867e7b7e3521f4e3ba  vp90-2-13-largescaling.webm
-bca1b02eebdb088fa3f389fe0e7571e75a71f523  vp90-2-13-largescaling.webm.md5
-c740708fa390806eebaf669909c1285ab464f886  vp90-2-14-resize-fp-tiles-1-2.webm
-c7b85ffd8e11500f73f52e7dc5a47f57c393d47f  vp90-2-14-resize-fp-tiles-1-2.webm.md5
-ec8faa352a08f7033c60f29f80d505e2d7daa103  vp90-2-14-resize-fp-tiles-1-4.webm
-6852c783fb421bda5ded3d4c5a3ffc46de03fbc1  vp90-2-14-resize-fp-tiles-1-4.webm.md5
-8af61853ac0d07c4cb5bf7c2016661ba350b3497  vp90-2-14-resize-fp-tiles-1-8.webm
-571353bac89fea60b5706073409aa3c0d42aefe9  vp90-2-14-resize-fp-tiles-1-8.webm.md5
-b1c187ed69931496b82ec194017a79831bafceef  vp90-2-14-resize-fp-tiles-1-16.webm
-1c199a41afe42ce303944d70089eaaa2263b4a09  vp90-2-14-resize-fp-tiles-1-16.webm.md5
-8eaae5a6f2dff934610b0c7a917d7f583ba74aa5  vp90-2-14-resize-fp-tiles-2-1.webm
-db18fcf915f7ffaea6c39feab8bda6c1688af011  vp90-2-14-resize-fp-tiles-2-1.webm.md5
-bc3046d138941e2a20e9ceec0ff6d25c25d12af3  vp90-2-14-resize-fp-tiles-4-1.webm
-393211b808030d09a79927b17a4374b2f68a60ae  vp90-2-14-resize-fp-tiles-4-1.webm.md5
-6e8f8e31721a0f7f68a2964e36e0e698c2e276b1  vp90-2-14-resize-fp-tiles-8-1.webm
-491fd3cd78fb0577bfe905bb64bbf64bd7d29140  vp90-2-14-resize-fp-tiles-8-1.webm.md5
-cc5958da2a7edf739cd2cfeb18bd05e77903087e  vp90-2-14-resize-fp-tiles-16-1.webm
-0b58daf55aaf9063bf5b4fb33393d18b417dc428  vp90-2-14-resize-fp-tiles-16-1.webm.md5
-821eeecc9d8c6a316134dd42d1ff057787d8047b  vp90-2-14-resize-fp-tiles-2-4.webm
-374c549f2839a3d0b732c4e3650700144037e76c  vp90-2-14-resize-fp-tiles-2-4.webm.md5
-dff8c8e49aacea9f4c7f22cb882da984e2a1b405  vp90-2-14-resize-fp-tiles-2-8.webm
-e5b8820a7c823b21297d6e889e57ec401882c210  vp90-2-14-resize-fp-tiles-2-8.webm.md5
-77629e4b23e32896aadf6e994c78bd4ffa1c7797  vp90-2-14-resize-fp-tiles-2-16.webm
-1937f5df032664ac345d4613ad4417b4967b1230  vp90-2-14-resize-fp-tiles-2-16.webm.md5
-380ba5702bb1ec7947697314ab0300b5c56a1665  vp90-2-14-resize-fp-tiles-4-2.webm
-fde7b30d2aa64c1e851a4852f655d79fc542cf66  vp90-2-14-resize-fp-tiles-4-2.webm.md5
-dc784b258ffa2abc2ae693d11792acf0bb9cb74f  vp90-2-14-resize-fp-tiles-8-2.webm
-edf26f0130aeee8342d49c2c8f0793ad008782d9  vp90-2-14-resize-fp-tiles-8-2.webm.md5
-8e575789fd63ebf69e8eff1b9a4351a249a73bee  vp90-2-14-resize-fp-tiles-16-2.webm
-b6415318c1c589a1f64b9d569ce3cabbec2e0d52  vp90-2-14-resize-fp-tiles-16-2.webm.md5
-e3adc944a11c4c5517e63664c84ebb0847b64d81  vp90-2-14-resize-fp-tiles-4-8.webm
-03cba0532bc90a05b1990db830bf5701e24e7982  vp90-2-14-resize-fp-tiles-4-8.webm.md5
-3b27a991eb6d78dce38efab35b7db682e8cbbee3  vp90-2-14-resize-fp-tiles-4-16.webm
-5d16b7f82bf59f802724ddfd97abb487150b1c9d  vp90-2-14-resize-fp-tiles-4-16.webm.md5
-d5fed8c28c1d4c7e232ebbd25cf758757313ed96  vp90-2-14-resize-fp-tiles-8-4.webm
-5a8ff8a52cbbde7bfab569beb6d971c5f8b904f7  vp90-2-14-resize-fp-tiles-8-4.webm.md5
-17a5faa023d77ee9dad423a4e0d3145796bbc500  vp90-2-14-resize-fp-tiles-16-4.webm
-2ef8daa3c3e750fd745130d0a76a39fe86f0448f  vp90-2-14-resize-fp-tiles-16-4.webm.md5
-9361e031f5cc990d8740863e310abb5167ae351e  vp90-2-14-resize-fp-tiles-8-16.webm
-57f13a2197486584f4e1a4f82ad969f3abc5a1a2  vp90-2-14-resize-fp-tiles-8-16.webm.md5
-5803fc6fcbfb47b7661f3fcc6499158a32b56675  vp90-2-14-resize-fp-tiles-16-8.webm
-be0fe64a1a4933696ff92d93f9bdecdbd886dc13  vp90-2-14-resize-fp-tiles-16-8.webm.md5
-0ac0f6d20a0afed77f742a3b9acb59fd7b9cb093  vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
-1765315acccfe6cd12230e731369fcb15325ebfa  vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
-4a2b7a683576fe8e330c7d1c4f098ff4e70a43a8  vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
-1ef480392112b3509cb190afbb96f9a38dd9fbac  vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
-e615575ded499ea1d992f3b38e3baa434509cdcd  vp90-2-15-segkey.webm
-e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
-9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
-8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
-698a6910a97486b833073ef0c0b18d75dce57ee8  vp90-2-16-intra-only.webm
-5661b0168752969f055eec37b05fa9fa947dc7eb  vp90-2-16-intra-only.webm.md5
-c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa  vp90-2-17-show-existing-frame.webm
-cc75f351818b9a619818f5cc77b9bc013d0c1e11  vp90-2-17-show-existing-frame.webm.md5
-0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
-367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
-eb438c6540eb429f74404eedfa3228d409c57874  desktop_640_360_30.yuv
-89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab  kirland_640_480_30.yuv
-33c533192759e5bb4f07abfbac389dc259db4686  macmarcomoving_640_480_30.yuv
-8bfaab121080821b8f03b23467911e59ec59b8fe  macmarcostationary_640_480_30.yuv
-70894878d916a599842d9ad0dcd24e10c13e5467  niklas_640_480_30.yuv
-8784b6df2d8cc946195a90ac00540500d2e522e4  tacomanarrows_640_480_30.yuv
-edd86a1f5e62fd9da9a9d46078247759c2638009  tacomasmallcameramovement_640_480_30.yuv
-9a70e8b7d14fba9234d0e51dce876635413ce444  thaloundeskmtg_640_480_30.yuv
-e7d315dbf4f3928779e0dc624311196d44491d32  niklas_1280_720_30.yuv
-c77e4a26616add298a05dd5d12397be22c0e40c5  vp90-2-18-resize.ivf
-c12918cf0a716417fba2de35c3fc5ab90e52dfce  vp90-2-18-resize.ivf.md5
-717da707afcaa1f692ff1946f291054eb75a4f06  screendata.y4m
+d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
+76024eb753cdac6a5e5703aaea189d35c3c30ac7 *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
+7448d8798a4380162d4b56f9b452e2f6f9e24e7a *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
+83f50908c8dc0ef8760595447a2ff7727489542e *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v2.webm
+25751f5d3b05ff03f0719ad42cd625348eb8961e *invalid-vp90-01-v2.webm.res
+d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
+8e2eff4af87d2b561cce2365713269e301457ef3 *invalid-vp90-02-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res
+d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
+3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
+c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv
+614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m
+82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv
+b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m
+81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv
+b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m
+5184c46ddca8b1fadd16742e8500115bc8f749da *vp80-00-comprehensive-001.ivf
+65bf1bbbced81b97bd030f376d1b7f61a224793f *vp80-00-comprehensive-002.ivf
+906b4c1e99eb734504c504b3f1ad8052137ce672 *vp80-00-comprehensive-003.ivf
+ec144b1af53af895db78355785650b96dd3f0ade *vp80-00-comprehensive-004.ivf
+afc7091785c62f1c121c4554a2830c30704587d9 *vp80-00-comprehensive-005.ivf
+42ea9d55c818145d06a9b633b8e85c6a6164fd3e *vp80-00-comprehensive-006.ivf
+e5b3a73ab79fe024c14309d653d6bed92902ee3b *vp80-00-comprehensive-007.ivf
+f3c50a58875930adfb84525c0ef59d7e4c08540c *vp80-00-comprehensive-008.ivf
+4b2841fdb83db51ae322096ae468bbb9dc2c8362 *vp80-00-comprehensive-009.ivf
+efbff736e3a91ab6a98c5bc2dce65d645944c7b1 *vp80-00-comprehensive-010.ivf
+6b315102cae008d22a3d2c231be92cb704a222f8 *vp80-00-comprehensive-011.ivf
+f3214a4fea14c2d5ec689936c1613f274c859ee8 *vp80-00-comprehensive-012.ivf
+e4094e96d308c8a35b74c480a43d853c5294cd34 *vp80-00-comprehensive-013.ivf
+5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a *vp80-00-comprehensive-014.ivf
+e8467688ddf26b5000664f904faf0d70506aa653 *vp80-00-comprehensive-015.ivf
+aab55582337dfd2a39ff54fb2576a91910d49337 *vp80-00-comprehensive-016.ivf
+1ba24724f80203c9bae4f1d0f99d534721980016 *vp80-00-comprehensive-017.ivf
+143a15512b46f436280ddb4d0e6411eb4af434f2 *vp80-00-comprehensive-018.ivf
+c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b *vp80-01-intra-1400.ivf
+f383955229afe3408453e316d11553d923ca60d5 *vp80-01-intra-1411.ivf
+84e1f4343f174c9f3c83f834bac3196fb325bf2c *vp80-01-intra-1416.ivf
+fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51 *vp80-01-intra-1417.ivf
+71ea772d3e9d315b8cbecf41207b8a237c34853b *vp80-02-inter-1402.ivf
+d85dbc4271525dcd128c503f936fe69091d1f8d0 *vp80-02-inter-1412.ivf
+d4e5d3ad56511867d025f93724d090f92ba6ec3d *vp80-02-inter-1418.ivf
+91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa *vp80-02-inter-1424.ivf
+17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02 *vp80-03-segmentation-01.ivf
+3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb *vp80-03-segmentation-02.ivf
+c156778d5340967d4b369c490848076e92f1f875 *vp80-03-segmentation-03.ivf
+d25dcff6c60e87a1af70945b8911b6b4998533b0 *vp80-03-segmentation-04.ivf
+362baba2ce454c9db21218f35e81c27a5ed0b730 *vp80-03-segmentation-1401.ivf
+d223ae7ee748ce07e74c4679bfd219e84aa9f4b0 *vp80-03-segmentation-1403.ivf
+033adf7f3a13836a3f1cffcb87c1972900f2b5c6 *vp80-03-segmentation-1407.ivf
+4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f *vp80-03-segmentation-1408.ivf
+f37a62b197c2600d75e0ccfbb31b60efdedac251 *vp80-03-segmentation-1409.ivf
+eb25bd7bfba5b2f6935018a930f42d123b1e7fcd *vp80-03-segmentation-1410.ivf
+b9d5c436663a30c27cfff84b53a002e501258843 *vp80-03-segmentation-1413.ivf
+6da92b9d1a180cc3a8afe348ab12258f5a37be1a *vp80-03-segmentation-1414.ivf
+a4f5842602886bd669f115f93d8a35c035cb0948 *vp80-03-segmentation-1415.ivf
+f295dceb8ef278b77251b3f9df8aee22e161d547 *vp80-03-segmentation-1425.ivf
+198dbf9f36f733200e432664cc8c5752d59779de *vp80-03-segmentation-1426.ivf
+7704804e32f5de976803929934a7fafe101ac7b0 *vp80-03-segmentation-1427.ivf
+831ccd862ea95ca025d2f3bd8b88678752f5416d *vp80-03-segmentation-1432.ivf
+b3c11978529289f9109f2766fcaba3ebc40e11ef *vp80-03-segmentation-1435.ivf
+a835a731f5520ebfc1002c40121264d0020559ac *vp80-03-segmentation-1436.ivf
+1d1732942f773bb2a5775fcb9689b1579ce28eab *vp80-03-segmentation-1437.ivf
+db04799adfe089dfdf74dbd43cc05ede7161f99e *vp80-03-segmentation-1441.ivf
+7caf39b3f20cfd52b998210878062e52a5edf1e6 *vp80-03-segmentation-1442.ivf
+3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261 *vp80-04-partitions-1404.ivf
+93cc323b6b6867f1b12dd48773424549c6960a6b *vp80-04-partitions-1405.ivf
+047eedb14b865bdac8a3538e63801054e0295e9c *vp80-04-partitions-1406.ivf
+0f1233bd2bc33f56ce5e495dbd455d122339f384 *vp80-05-sharpness-1428.ivf
+51767fc136488a9535c2a4c38067c542ee2048df *vp80-05-sharpness-1429.ivf
+9805aa107672de25d6fb8c35e20d06deca5efe18 *vp80-05-sharpness-1430.ivf
+61db6b965f9c27aebe71b85bf2d5877e58e4bbdf *vp80-05-sharpness-1431.ivf
+10420d266290d2923555f84af38eeb96edbd3ae8 *vp80-05-sharpness-1433.ivf
+3ed24f9a80cddfdf75824ba95cdb4ff9286cb443 *vp80-05-sharpness-1434.ivf
+c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163 *vp80-05-sharpness-1438.ivf
+aff51d865c2621b60510459244ea83e958e4baed *vp80-05-sharpness-1439.ivf
+da386e72b19b5485a6af199c5eb60ef25e510dd1 *vp80-05-sharpness-1440.ivf
+6759a095203d96ccd267ce09b1b050b8cc4c2f1f *vp80-05-sharpness-1443.ivf
+b95d3cc1d0df991e63e150a801710a72f20d9ba0 *vp80-06-smallsize.ivf
+db55ec7fd02c864ba996ff060b25b1e08611330b *vp80-00-comprehensive-001.ivf.md5
+29db0ad011cba1e45f856d5623cd38dac3e3bf19 *vp80-00-comprehensive-002.ivf.md5
+e84f258f69e173e7d68f8f8c037a0a3766902182 *vp80-00-comprehensive-003.ivf.md5
+eb7912eaf69559a16fd82bc3f5fb1524cf4a4466 *vp80-00-comprehensive-004.ivf.md5
+4206f71c94894bd5b5b376f6c09b3817dbc65206 *vp80-00-comprehensive-005.ivf.md5
+4f89b356f6f2fecb928f330a10f804f00f5325f5 *vp80-00-comprehensive-006.ivf.md5
+2813236a32964dd8007e17648bcf035a20fcda6c *vp80-00-comprehensive-007.ivf.md5
+10746c72098f872803c900e17c5680e451f5f498 *vp80-00-comprehensive-008.ivf.md5
+39a23d0692ce64421a7bb7cdf6ccec5928d37fff *vp80-00-comprehensive-009.ivf.md5
+f6e3de8931a0cc659bda8fbc14050346955e72d4 *vp80-00-comprehensive-010.ivf.md5
+101683ec195b6e944f7cd1e468fc8921439363e6 *vp80-00-comprehensive-011.ivf.md5
+1f592751ce46d8688998fa0fa4fbdcda0fd4058c *vp80-00-comprehensive-012.ivf.md5
+6066176f90ca790251e795fca1a5797d59999841 *vp80-00-comprehensive-013.ivf.md5
+2656da94ba93691f23edc4d60b3a09e2be46c217 *vp80-00-comprehensive-014.ivf.md5
+c6e0d5f5d61460c8ac8edfa4e701f10312c03133 *vp80-00-comprehensive-015.ivf.md5
+ee60fee501d8493e34e8d6a1fe315b51ed09b24a *vp80-00-comprehensive-016.ivf.md5
+9f1914ceffcad4546c0a29de3ef591d8bea304dc *vp80-00-comprehensive-017.ivf.md5
+e0305178fe288a9fd8082b39e2d03181edb19054 *vp80-00-comprehensive-018.ivf.md5
+612494da2fa799cc9d76dcdd835ae6c7cb2e5c05 *vp80-01-intra-1400.ivf.md5
+48ea06097ac8269c5e8c2131d3d0639f431fcf0e *vp80-01-intra-1411.ivf.md5
+6e2ab4e7677ad0ba868083ca6bc387ee922b400c *vp80-01-intra-1416.ivf.md5
+eca0a90348959ce3854142f8d8641b13050e8349 *vp80-01-intra-1417.ivf.md5
+920feea203145d5c2258a91c4e6991934a79a99e *vp80-02-inter-1402.ivf.md5
+f71d97909fe2b3dd65be7e1f56c72237f0cef200 *vp80-02-inter-1412.ivf.md5
+e911254569a30bbb2a237ff8b79f69ed9da0672d *vp80-02-inter-1418.ivf.md5
+58c789c50c9bb9cc90580bed291164a0939d28ba *vp80-02-inter-1424.ivf.md5
+ff3e2f441327b9c20a0b37c524e0f5a48a36de7b *vp80-03-segmentation-01.ivf.md5
+0791f417f076a542ae66fbc3426ab4d94cbd6c75 *vp80-03-segmentation-02.ivf.md5
+722e50f1a6a91c34302d68681faffc1c26d1cc57 *vp80-03-segmentation-03.ivf.md5
+c701f1885bcfb27fb8e70cc65606b289172ef889 *vp80-03-segmentation-04.ivf.md5
+f79bc9ec189a2b4807632a3d0c5bf04a178b5300 *vp80-03-segmentation-1401.ivf.md5
+b9aa4c74c0219b639811c44760d0b24cd8bb436a *vp80-03-segmentation-1403.ivf.md5
+70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334 *vp80-03-segmentation-1407.ivf.md5
+265f962ee781531f9a93b9309461316fd32b2a1d *vp80-03-segmentation-1408.ivf.md5
+0c4ecbbd6dc042d30e626d951b65f460dd6cd563 *vp80-03-segmentation-1409.ivf.md5
+cf779af36a937f06570a0fca9db64ba133451dee *vp80-03-segmentation-1410.ivf.md5
+0e6c5036d51ab078842f133934926c598a9cff02 *vp80-03-segmentation-1413.ivf.md5
+eb3930aaf229116c80d507516c34759c3f6cdf69 *vp80-03-segmentation-1414.ivf.md5
+123d6c0f72ee87911c4ae7538e87b7d163b22d6c *vp80-03-segmentation-1415.ivf.md5
+e70551d1a38920e097a5d8782390b79ecaeb7505 *vp80-03-segmentation-1425.ivf.md5
+44e8f4117e46dbb302b2cfd81171cc1a1846e431 *vp80-03-segmentation-1426.ivf.md5
+52636e54aee5f95bbace37021bd67de5db767e9a *vp80-03-segmentation-1427.ivf.md5
+b1ad3eff20215c28e295b15ef3636ed926d59cba *vp80-03-segmentation-1432.ivf.md5
+24c22a552fa28a90e5978f67f57181cc2d7546d7 *vp80-03-segmentation-1435.ivf.md5
+96c49c390abfced18a7a8c9b9ea10af778e10edb *vp80-03-segmentation-1436.ivf.md5
+f95eb6214571434f1f73ab7833b9ccdf47588020 *vp80-03-segmentation-1437.ivf.md5
+1c0700ca27c9b0090a7747a4b0b4dc21d1843181 *vp80-03-segmentation-1441.ivf.md5
+81d4f23ca32667ee958bae579c8f5e97ba72eb97 *vp80-03-segmentation-1442.ivf.md5
+272efcef07a3a30fbca51bfd566063d8258ec0be *vp80-04-partitions-1404.ivf.md5
+66ed219ab812ac801b256d35cf495d193d4cf478 *vp80-04-partitions-1405.ivf.md5
+36083f37f56f502bd60ec5e07502ee9e6b8699b0 *vp80-04-partitions-1406.ivf.md5
+6ca909bf168a64c09415626294665dc1be3d1973 *vp80-05-sharpness-1428.ivf.md5
+1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a *vp80-05-sharpness-1429.ivf.md5
+71bcbe5357d36a19df5b07fbe3e27bffa8893f0a *vp80-05-sharpness-1430.ivf.md5
+89a09b1dffce2d55770a89e58d9925c70ef79bf8 *vp80-05-sharpness-1431.ivf.md5
+08444a18b4e6ba3450c0796dd728d48c399a2dc9 *vp80-05-sharpness-1433.ivf.md5
+6d6223719a90c13e848aa2a8a6642098cdb5977a *vp80-05-sharpness-1434.ivf.md5
+41d70bb5fa45bc88da1604a0af466930b8dd77b5 *vp80-05-sharpness-1438.ivf.md5
+086c56378df81b6cee264d7540a7b8f2b405c7a4 *vp80-05-sharpness-1439.ivf.md5
+d32dc2c4165eb266ea4c23c14a45459b363def32 *vp80-05-sharpness-1440.ivf.md5
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df *vp80-05-sharpness-1443.ivf.md5
+d6f246df012c241b5fa6c1345019a3703d85c419 *vp80-06-smallsize.ivf.md5
+ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 *vp90-2-00-quantizer-00.webm
+ac5eda33407d0521c7afca43a63fd305c0cd9d13 *vp90-2-00-quantizer-00.webm.md5
+2ca0463f2cfb93d25d7dded174db70b7cb87cb48 *vp90-2-00-quantizer-01.webm
+10d98884fc6d9a5f47a2057922b8e25dd48d7786 *vp90-2-00-quantizer-01.webm.md5
+d80a2920a5e0819d69dcba8fe260c01f820f8982 *vp90-2-00-quantizer-02.webm
+c964c8e5e04165fabbf1c6ee8ee5121d35921965 *vp90-2-00-quantizer-02.webm.md5
+fdef046777b5b75c962b715d809dbe2ea331afb9 *vp90-2-00-quantizer-03.webm
+f270bee0b0c7aa2bf4c5afe098556b4f3f890faf *vp90-2-00-quantizer-03.webm.md5
+66d98609e809394a6ac730787e6724e3badc075a *vp90-2-00-quantizer-04.webm
+427433bfe121c4aea1095ec3124fdc174d200e3a *vp90-2-00-quantizer-04.webm.md5
+e6e42626d8cadf0b5be16313f69212981b96fee5 *vp90-2-00-quantizer-05.webm
+c98f6a9a1af4cfd71416792827304266aad4bd46 *vp90-2-00-quantizer-05.webm.md5
+413ef09b721f5dcec1a96e937a97e5873c2e6db6 *vp90-2-00-quantizer-06.webm
+5080e940a23805c82e578e21b57fc2c511e76376 *vp90-2-00-quantizer-06.webm.md5
+4a50a5f4ac717c30dfaae8bb46702e3542e867de *vp90-2-00-quantizer-07.webm
+76c429a02b56762e10ee4db88729d8834b3a70f4 *vp90-2-00-quantizer-07.webm.md5
+d2f4e464780bf8b7e647efa18ac777a930e62bc0 *vp90-2-00-quantizer-08.webm
+ab94aabf9316111b52d7c531962ed4123313b6ba *vp90-2-00-quantizer-08.webm.md5
+174bc58433936dd79550398d744f1072ce7f5693 *vp90-2-00-quantizer-09.webm
+e1f7690cd83ccc56d045e17cce552544a5f03810 *vp90-2-00-quantizer-09.webm.md5
+52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-00-quantizer-10.webm
+9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 *vp90-2-00-quantizer-10.webm.md5
+10031eecafde1e1d8e6323fe2b2a1d7e77a66869 *vp90-2-00-quantizer-11.webm
+fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 *vp90-2-00-quantizer-11.webm.md5
+78e9f7bb77e8e348155bbdfa12790789d1d50c34 *vp90-2-00-quantizer-12.webm
+0961d060cc8dd469c6dac8d7d75f927c0bb971b8 *vp90-2-00-quantizer-12.webm.md5
+133b77a3bbcef652552d74ffc46afbfe3b8a1cba *vp90-2-00-quantizer-13.webm
+df29e5e0f95772af482f540d776f6b9dea4bfa29 *vp90-2-00-quantizer-13.webm.md5
+27323afdaf8987e025c27129c74c86502315a206 *vp90-2-00-quantizer-14.webm
+ce96a2cc312942f0427a463f15a392870dd69764 *vp90-2-00-quantizer-14.webm.md5
+ab58d0b41037829f6bc993910999f4af0212aafd *vp90-2-00-quantizer-15.webm
+40f700db606501aa7cb49049624cbdde6409b122 *vp90-2-00-quantizer-15.webm.md5
+cd948e66448aafb65998815ce37241f95d7c9ee7 *vp90-2-00-quantizer-16.webm
+039b742d149c945ed79c7b9a6384352852a1c116 *vp90-2-00-quantizer-16.webm.md5
+62f56e663e13c576764e491cf08f19bd46a71999 *vp90-2-00-quantizer-17.webm
+90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e *vp90-2-00-quantizer-17.webm.md5
+f26ecad7263cd66a614e53ba5d7c00df181affeb *vp90-2-00-quantizer-18.webm
+cda0a1c0fca2ec2976ae55124a8a67305508bae6 *vp90-2-00-quantizer-18.webm.md5
+94bfc4c04fcfe139a63b98c569e8c14ba98c401f *vp90-2-00-quantizer-19.webm
+5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f *vp90-2-00-quantizer-19.webm.md5
+0ee88e9318985e1e245de78c2c4a665885ab76a7 *vp90-2-00-quantizer-20.webm
+4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 *vp90-2-00-quantizer-20.webm.md5
+6a995cb2b1db33da8087321df1e646f95c3e32d1 *vp90-2-00-quantizer-21.webm
+e216b4a1eceac03efcc433759be54ab8ea87b24b *vp90-2-00-quantizer-21.webm.md5
+aa7722fc427e7180115f3c9cd96bb6b2768e7296 *vp90-2-00-quantizer-22.webm
+1aa813bd45ae831bf5e79ace4d73dfd25989a07d *vp90-2-00-quantizer-22.webm.md5
+7677e5b929ed6d142041f19b8a9cd5822ee1504a *vp90-2-00-quantizer-23.webm
+0de0af34abd843d5b37e58baf3ed96a6104b64c3 *vp90-2-00-quantizer-23.webm.md5
+b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 *vp90-2-00-quantizer-24.webm
+db6033af2ba2f2bca62468fb4b8808e474f93923 *vp90-2-00-quantizer-24.webm.md5
+8135ba35587fd92cd4667be7896323d9b634401c *vp90-2-00-quantizer-25.webm
+3499e00c2cc15876f61f07e3d3cfca54ebcd98fd *vp90-2-00-quantizer-25.webm.md5
+af0fa2907746db82d345f6d831fcc1b2862a29fb *vp90-2-00-quantizer-26.webm
+cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 *vp90-2-00-quantizer-26.webm.md5
+bd0002e91323776beb5ff11e06edcf19fc08e9b9 *vp90-2-00-quantizer-27.webm
+fe72154ef196067d6c272521012dd79706496cac *vp90-2-00-quantizer-27.webm.md5
+fc15eb606f81455ff03df16bf3432296b002c43c *vp90-2-00-quantizer-28.webm
+40b2e24b542206a6bfd746ef199e49ccea07678a *vp90-2-00-quantizer-28.webm.md5
+3090bbf913cad0b2eddca7228f5ed51a58378b8d *vp90-2-00-quantizer-29.webm
+eb59745e0912d8ed6c928268bcf265237c9ba93f *vp90-2-00-quantizer-29.webm.md5
+c615abdca9c25e1cb110d908edbedfb3b7c92b91 *vp90-2-00-quantizer-30.webm
+ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 *vp90-2-00-quantizer-30.webm.md5
+037d9f242086cfb085518f6416259defa82d5fc2 *vp90-2-00-quantizer-31.webm
+4654b40792572f0a790874c6347ef9196d86c1a7 *vp90-2-00-quantizer-31.webm.md5
+505899f3f3515044c5c8b3213d9b9d16f614619d *vp90-2-00-quantizer-32.webm
+659a2e6dd02df323f62600626859006640b445df *vp90-2-00-quantizer-32.webm.md5
+8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc *vp90-2-00-quantizer-33.webm
+5b175ef1120ddeba4feae1247bf381bbc4e816ce *vp90-2-00-quantizer-33.webm.md5
+4d283755d17e287b1d099a80604398f60d7fb6ea *vp90-2-00-quantizer-34.webm
+22a739de95acfeb27524e3700b8f678a9ad744d8 *vp90-2-00-quantizer-34.webm.md5
+4296f56a892a412d3d4f64824718dd566c4e6459 *vp90-2-00-quantizer-35.webm
+c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 *vp90-2-00-quantizer-35.webm.md5
+6f54e11da461e4410dd9075b015e2d9bc1d07dfb *vp90-2-00-quantizer-36.webm
+0b3573f5addea4e3eb11a0b85f068299d5bdad78 *vp90-2-00-quantizer-36.webm.md5
+210581682a26c2c4375efc785c36e07539888bc2 *vp90-2-00-quantizer-37.webm
+2b4fb6f8ba975237858e61cc8f560bcfc87cb38e *vp90-2-00-quantizer-37.webm.md5
+a15ef31283dfc4860f837fe200eb32a445f59629 *vp90-2-00-quantizer-38.webm
+fb76771f3a795054b9936f70da7505c3ac585284 *vp90-2-00-quantizer-38.webm.md5
+1df8433a441412831daae6726df89fa70d21b14d *vp90-2-00-quantizer-39.webm
+39e162c09a20e7e684868097766347014371fee6 *vp90-2-00-quantizer-39.webm.md5
+5330e4788ab9129dbb25a7a7d5411104521248b6 *vp90-2-00-quantizer-40.webm
+872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 *vp90-2-00-quantizer-40.webm.md5
+d88d03b982889e399a78d7a06eeb1cf30e6c2da2 *vp90-2-00-quantizer-41.webm
+5b4f7217e57fa2a221011d0b32f8d0409496b7b6 *vp90-2-00-quantizer-41.webm.md5
+9e16406e3e26955a6e17d455ef1ef64bbfa26e53 *vp90-2-00-quantizer-42.webm
+0219d090cf37daabe19256ba8e932ba4874b92e4 *vp90-2-00-quantizer-42.webm.md5
+a9b15843486fb05f8cd15437ef279782a42b75db *vp90-2-00-quantizer-43.webm
+3c9b0b4c607f9579a31726bfcf56729334ddc686 *vp90-2-00-quantizer-43.webm.md5
+1dbc931ac446c91eabe7213efff55b596cccf07c *vp90-2-00-quantizer-44.webm
+73bc8f675103abaef3d9f73a2742b3bffd726d23 *vp90-2-00-quantizer-44.webm.md5
+7c6c1be15beb9d6201204b018966c8c4f9777efc *vp90-2-00-quantizer-45.webm
+c907b29da821f790c6748de61f592689312e4e36 *vp90-2-00-quantizer-45.webm.md5
+07b434da1a467580f73b32177ee11b3e00f65a0d *vp90-2-00-quantizer-46.webm
+7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 *vp90-2-00-quantizer-46.webm.md5
+233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 *vp90-2-00-quantizer-47.webm
+527e0a9fb932efe915027ffe077f9e8d3a4fb139 *vp90-2-00-quantizer-47.webm.md5
+719613df7307e205c3fdb6acfb373849c5ab23c7 *vp90-2-00-quantizer-48.webm
+65ab6c9d1b682c183b201c7ff42b90343ce3e304 *vp90-2-00-quantizer-48.webm.md5
+3bf04a598325ed0eabae1598ec7f718f715ec672 *vp90-2-00-quantizer-49.webm
+ac68c4387ce11fcc998d8ba455ab9b2bb361d240 *vp90-2-00-quantizer-49.webm.md5
+d59238fb3a654931c9b65a11e7321b40d1f702e9 *vp90-2-00-quantizer-50.webm
+d0576bfede46fd55659f028f2fd28554ceb3e6cc *vp90-2-00-quantizer-50.webm.md5
+3f579785101d4209360dd96f8c2ffe9beddf3bee *vp90-2-00-quantizer-51.webm
+89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 *vp90-2-00-quantizer-51.webm.md5
+28be5836e2fedefe4babf12fc9b79e460ab0a0f4 *vp90-2-00-quantizer-52.webm
+f3dd52b70c18345fee740220f35da9c4def2017a *vp90-2-00-quantizer-52.webm.md5
+488ad4058c17170665b6acd1021fade9a02771e4 *vp90-2-00-quantizer-53.webm
+1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 *vp90-2-00-quantizer-53.webm.md5
+682978289cb28cc8c9d39bc797300e45d6039de7 *vp90-2-00-quantizer-54.webm
+36c35353f2c03cb099bd710d9994de7d9ed88834 *vp90-2-00-quantizer-54.webm.md5
+c398ce49af762a48f10cc4da9fae0769aae5f226 *vp90-2-00-quantizer-55.webm
+2cf3570542d984f167ab087f59493c7fb47e0ed2 *vp90-2-00-quantizer-55.webm.md5
+3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 *vp90-2-00-quantizer-56.webm
+d3f93f8272b6de31cffb011a26f11abb514efb12 *vp90-2-00-quantizer-56.webm.md5
+f4e8e14b1f278801a7eb6f11734780a01b1668e9 *vp90-2-00-quantizer-57.webm
+6478fdf1d7faf6db5f19dffc5e1363af358699ee *vp90-2-00-quantizer-57.webm.md5
+307dc264f57cc618fff211fa44d7f52767ed9660 *vp90-2-00-quantizer-58.webm
+cf231d4a52d492fa692ea4194ec5eb7511fec54e *vp90-2-00-quantizer-58.webm.md5
+1fd7cd596170afce2de0b1441b7674bda5723440 *vp90-2-00-quantizer-59.webm
+4681f7ef96f63e085c41bb1a964b0df7e67e0b38 *vp90-2-00-quantizer-59.webm.md5
+34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 *vp90-2-00-quantizer-60.webm
+58691ef53b6b623810e2c57ded374c77535df935 *vp90-2-00-quantizer-60.webm.md5
+e6e812406aab81021bb16e772c1db03f75906cb6 *vp90-2-00-quantizer-61.webm
+76436eace62f08ff92b61a0845e66667a027db1b *vp90-2-00-quantizer-61.webm.md5
+84d811bceed70c950a6a08e572a6e274866e72b1 *vp90-2-00-quantizer-62.webm
+2d937cc011eeddd95222b960982da5cd18db580f *vp90-2-00-quantizer-62.webm.md5
+0912b295ba0ea09359315315ffd67d22d046f883 *vp90-2-00-quantizer-63.webm
+5a829031055d70565f57dbcd47a6ac33619952b3 *vp90-2-00-quantizer-63.webm.md5
+0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 *vp90-2-01-sharpness-1.webm
+5a0476be4448bae8f8ca17ea236c98793a755948 *vp90-2-01-sharpness-1.webm.md5
+51e02d7911810cdf5be8b68ac40aedab479a3179 *vp90-2-01-sharpness-2.webm
+a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 *vp90-2-01-sharpness-2.webm.md5
+0603f8ad239c07a531d948187f4dafcaf51eda8d *vp90-2-01-sharpness-3.webm
+3af8000a69c72fe77881e3176f026c2affb78cc7 *vp90-2-01-sharpness-3.webm.md5
+4ca4839f48146252fb261ed88838d80211804841 *vp90-2-01-sharpness-4.webm
+08832a1494f84fa9edd40e080bcf2c0e80100c76 *vp90-2-01-sharpness-4.webm.md5
+95099dc8f9cbaf9b9a7dd65311923e441ff70731 *vp90-2-01-sharpness-5.webm
+93ceee30c140f0b406726c0d896b9db6031c4c7f *vp90-2-01-sharpness-5.webm.md5
+ceb4116fb7b078d266d153233b6d62a255a34e4c *vp90-2-01-sharpness-6.webm
+da83efe59e537ce538e8b03a6eac63cf25849c9a *vp90-2-01-sharpness-6.webm.md5
+b5f7cd19aece3880f9d616a778e5cc24c6b9b505 *vp90-2-01-sharpness-7.webm
+2957408d20deac8633941a2169f801bae6f086e1 *vp90-2-01-sharpness-7.webm.md5
+ffc096c2ce1050450ad462b5fabd2a5220846319 *vp90-2-02-size-08x08.webm
+e36d2ed6fa2746347710b750586aafa6a01ff3ae *vp90-2-02-size-08x08.webm.md5
+895b986f9fd55cd879472b31c6a06b82094418c8 *vp90-2-02-size-08x10.webm
+079157a19137ccaebba606f2871f45a397347150 *vp90-2-02-size-08x10.webm.md5
+1c5992203e62a2b83040ccbecd748b604e19f4c0 *vp90-2-02-size-08x16.webm
+9aa45ffdf2078f883bbed01450031b691819c144 *vp90-2-02-size-08x16.webm.md5
+d0a8953da1f85f484487408fee5da9e2a8391901 *vp90-2-02-size-08x18.webm
+59a5cc17d354c6a23e5e959d666b1456a5d49c56 *vp90-2-02-size-08x18.webm.md5
+1b13461a9fc65cb041bacfe4ea6f02d363397d61 *vp90-2-02-size-08x32.webm
+2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 *vp90-2-02-size-08x32.webm.md5
+2861f0a0daadb62295b0504a1fbe5b50c79a8f59 *vp90-2-02-size-08x34.webm
+6b5812cfb8a82d378ea2913bf009e93668020147 *vp90-2-02-size-08x34.webm.md5
+02f948216d4246579dc53c47fe55d8fb264ba251 *vp90-2-02-size-08x64.webm
+84b55fdee6d9aa820c7a8c62822446184b191767 *vp90-2-02-size-08x64.webm.md5
+4b011242cbf42516efd2b197baebb61dd34562c9 *vp90-2-02-size-08x66.webm
+6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b *vp90-2-02-size-08x66.webm.md5
+4057796be9dd12df48ab607f502ae6aa70eeeab6 *vp90-2-02-size-10x08.webm
+71c752c51aec9f48de286b93f4c20e9c11cad7d0 *vp90-2-02-size-10x08.webm.md5
+6583c853fa43fc53d51743eac5f3a43a359d45d0 *vp90-2-02-size-10x10.webm
+1da524d24af1944b671d4d3f2b398d6e336584c3 *vp90-2-02-size-10x10.webm.md5
+ba442fc03ccd3a705c64c83b36f5ada67d198874 *vp90-2-02-size-10x16.webm
+7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 *vp90-2-02-size-10x16.webm.md5
+cc92ed40eef14f52e4d080cb2c57939dd8326374 *vp90-2-02-size-10x18.webm
+db5626275cc55ce970b91c995e74f6838d943aca *vp90-2-02-size-10x18.webm.md5
+3a93d501d22325e9fd4c9d8b82e2a432de33c351 *vp90-2-02-size-10x32.webm
+5cae51b0c71cfc131651f345f87583eb2903afaf *vp90-2-02-size-10x32.webm.md5
+50d2f2b15a9a5178153db44a9e03aaf32b227f67 *vp90-2-02-size-10x34.webm
+bb0efe058122641e7f73e94497dda2b9e6c21efd *vp90-2-02-size-10x34.webm.md5
+01624ec173e533e0b33fd9bdb91eb7360c7c9175 *vp90-2-02-size-10x64.webm
+b9c0e3b054463546356acf5157f9be92fd34732f *vp90-2-02-size-10x64.webm.md5
+2942879baf1c09e96b14d0fc84806abfe129c706 *vp90-2-02-size-10x66.webm
+bab5f539c2f91952e187456b4beafbb4c01e25ee *vp90-2-02-size-10x66.webm.md5
+88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 *vp90-2-02-size-16x08.webm
+7f48a0fcf8c25963f3057d7f6669c5f2415834b8 *vp90-2-02-size-16x08.webm.md5
+59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f *vp90-2-02-size-16x10.webm
+73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 *vp90-2-02-size-16x10.webm.md5
+066834fef9cf5b9a72932cf4dea5f253e14a976d *vp90-2-02-size-16x16.webm
+faec542f52f37601cb9c480d887ae9355be99372 *vp90-2-02-size-16x16.webm.md5
+195307b4eb3192271ee4a935b0e48deef0c54cc2 *vp90-2-02-size-16x18.webm
+5a92e19e624c0376321d4d0e22c0c91995bc23e1 *vp90-2-02-size-16x18.webm.md5
+14f3f884216d7ae16ec521f024a2f2d31bbf9c1a *vp90-2-02-size-16x32.webm
+ea622d1c817dd174556f7ee7ccfe4942b34d4845 *vp90-2-02-size-16x32.webm.md5
+2e0501100578a5da9dd47e4beea160f945bdd1ba *vp90-2-02-size-16x34.webm
+1b8645ef64239334921c5f56b24ce815e6070b05 *vp90-2-02-size-16x34.webm.md5
+89a6797fbebebe93215f367229a9152277f5dcfe *vp90-2-02-size-16x64.webm
+a03d8c1179ca626a8856fb416d635dbf377979cd *vp90-2-02-size-16x64.webm.md5
+0f3a182e0750fcbae0b9eae80c7a53aabafdd18d *vp90-2-02-size-16x66.webm
+8cb6736dc2d897c1283919a32068af377d66c59c *vp90-2-02-size-16x66.webm.md5
+68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 *vp90-2-02-size-18x08.webm
+874c7fb505be9db3160c57cb405c4dbd5b990dc2 *vp90-2-02-size-18x08.webm.md5
+0546352dd78496d4dd86c3727ac2ff36c9e72032 *vp90-2-02-size-18x10.webm
+1d80eb36557ea5f25a386495a36f93da0f25316b *vp90-2-02-size-18x10.webm.md5
+60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 *vp90-2-02-size-18x16.webm
+1ab6cdd89a53662995d103546e6611c84f9292ab *vp90-2-02-size-18x16.webm.md5
+f9a8f5fb749d69fd555db6ca093b7f77800c7b4f *vp90-2-02-size-18x18.webm
+ace8a66328f7802b15f9989c2720c029c6abd279 *vp90-2-02-size-18x18.webm.md5
+a197123a527ec25913a9bf52dc8c347749e00045 *vp90-2-02-size-18x32.webm
+34fbd7036752232d1663e70d7f7cdc93f7129202 *vp90-2-02-size-18x32.webm.md5
+f219655a639a774a2c9c0a9f45c28dc0b5e75e24 *vp90-2-02-size-18x34.webm
+2c4d622a9ea548791c1a07903d3702e9774388bb *vp90-2-02-size-18x34.webm.md5
+5308578da48c677d477a5404e19391d1303033c9 *vp90-2-02-size-18x64.webm
+e7fd4462527bac38559518ba80e41847db880f15 *vp90-2-02-size-18x64.webm.md5
+e109a7e013bd179f97e378542e1e81689ed06802 *vp90-2-02-size-18x66.webm
+45c04e422fb383c1f3be04beefaa4490e83bdb1a *vp90-2-02-size-18x66.webm.md5
+38844cae5d99caf445f7de33c3ae78494ce36c01 *vp90-2-02-size-32x08.webm
+ad018be39e493ca2405225034b1a5b7a42af6f3a *vp90-2-02-size-32x08.webm.md5
+7b57eaad55906f9de9903c8657a3fcb2aaf792ea *vp90-2-02-size-32x10.webm
+2294425d4e55d275af5e25a0beac9738a1b4ee73 *vp90-2-02-size-32x10.webm.md5
+f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 *vp90-2-02-size-32x16.webm
+ae10981d93913f0ab1f28c1146255e01769aa8c0 *vp90-2-02-size-32x16.webm.md5
+08b23ad838b6cf1fbfe3ad7e7775d95573e815fc *vp90-2-02-size-32x18.webm
+1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 *vp90-2-02-size-32x18.webm.md5
+d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 *vp90-2-02-size-32x32.webm
+e39c067a8ee2da52a51641eb1cb7f8eba935eb6b *vp90-2-02-size-32x32.webm.md5
+529429920dc36bd899059fa75a767f02c8c60874 *vp90-2-02-size-32x34.webm
+56888e7834f52b106e8911e3a7fc0f473b609995 *vp90-2-02-size-32x34.webm.md5
+38e848e160391c2b1a55040aadde613b9f4bf15e *vp90-2-02-size-32x64.webm
+8950485fb3f68b0e8be234db860e4ec5f5490fd0 *vp90-2-02-size-32x64.webm.md5
+5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 *vp90-2-02-size-32x66.webm
+225df9d7d72ec711b0b60f4aeb65311c97db054a *vp90-2-02-size-32x66.webm.md5
+695f929e2ce6fb11a1f180322d46c5cb1c97fa61 *vp90-2-02-size-34x08.webm
+5bb4262030018dd01883965c6aa6070185924ef6 *vp90-2-02-size-34x08.webm.md5
+5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 *vp90-2-02-size-34x10.webm
+71c100b437d3e8701632ae8d65c3555339b1c68f *vp90-2-02-size-34x10.webm.md5
+d0918923c987fba2d00193d83797b21289fe54aa *vp90-2-02-size-34x16.webm
+5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d *vp90-2-02-size-34x16.webm.md5
+553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd *vp90-2-02-size-34x18.webm
+a164c7f3c424987df2340496e6a8cf76e973f0f1 *vp90-2-02-size-34x18.webm.md5
+baf3e233634f150de81c18ba5d8848068e1c3c54 *vp90-2-02-size-34x32.webm
+22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 *vp90-2-02-size-34x32.webm.md5
+6d50a533774a7167350e4a7ef43c94a5622179a2 *vp90-2-02-size-34x34.webm
+0c099638e79c273546523e06704553e42eb00b00 *vp90-2-02-size-34x34.webm.md5
+698cdd0a5e895cc202c488675e682a8c537ede4f *vp90-2-02-size-34x64.webm
+9317b63987cddab8389510a27b86f9f3d46e3fa5 *vp90-2-02-size-34x64.webm.md5
+4b5335ca06f082b6b69f584eb8e7886bdcafefd3 *vp90-2-02-size-34x66.webm
+e18d68b35428f46a84a947c646804a51ef1d7cec *vp90-2-02-size-34x66.webm.md5
+a54ae7b494906ec928a876e8290e5574f2f9f6a2 *vp90-2-02-size-64x08.webm
+87f9f7087b6489d45e9e4b38ede2c5aef4a4928f *vp90-2-02-size-64x08.webm.md5
+24522c70804a3c23d937df2d829ae63965b23f38 *vp90-2-02-size-64x10.webm
+447ce03938ab53bffcb4a841ee0bfaa90462dcb9 *vp90-2-02-size-64x10.webm.md5
+2a5035d035d214ae614af8051930690ef623989b *vp90-2-02-size-64x16.webm
+84e355761dd2e0361b904c84c52a0dd0384d89cf *vp90-2-02-size-64x16.webm.md5
+3a293ef4e270a19438e59b817fbe5f43eed4d36b *vp90-2-02-size-64x18.webm
+666824e5ba746779eb46079e0631853dcc86d48b *vp90-2-02-size-64x18.webm.md5
+ed32fae837095c9e8fc95d223ec68101812932c2 *vp90-2-02-size-64x32.webm
+97086eadedce1d0d9c072b585ba7b49aec69b1e7 *vp90-2-02-size-64x32.webm.md5
+696c7a7250bdfff594f4dfd88af34239092ecd00 *vp90-2-02-size-64x34.webm
+253a1d38d452e7826b086846c6f872f829c276bb *vp90-2-02-size-64x34.webm.md5
+fc508e0e3c2e6872c60919a60b812c5232e9c2b0 *vp90-2-02-size-64x64.webm
+2cd6ebeca0f82e9f505616825c07950371b905ab *vp90-2-02-size-64x64.webm.md5
+0f8a4fc1d6521187660425c283f08dff8c66e476 *vp90-2-02-size-64x66.webm
+5806be11a1d346be235f88d3683e69f73746166c *vp90-2-02-size-64x66.webm.md5
+273b0c36e3658685cde250408a478116d7ae92f1 *vp90-2-02-size-66x08.webm
+23c3cd0dca20a2f71f036e77ea92025ff4e7a298 *vp90-2-02-size-66x08.webm.md5
+4844c59c3306d1e671bb0568f00e344bf797e66e *vp90-2-02-size-66x10.webm
+e041eaf6841d775f8fde8bbb4949d2733fdaab7f *vp90-2-02-size-66x10.webm.md5
+bdf3f1582b234fcd2805ffec59f9d716a2345302 *vp90-2-02-size-66x16.webm
+2ec85ee18119e6798968571ea6e1b93ca386e3af *vp90-2-02-size-66x16.webm.md5
+0acce9af12b13b025d5274013da7ef6f568f075f *vp90-2-02-size-66x18.webm
+77c4d53e2a5c96b70af9d575fe6811e0f5ee627b *vp90-2-02-size-66x18.webm.md5
+682b36a25774bbdedcd603f504d18eb63f0167d4 *vp90-2-02-size-66x32.webm
+53728fae2a428f16d376a29f341a64ddca97996a *vp90-2-02-size-66x32.webm.md5
+e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 *vp90-2-02-size-66x34.webm
+f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 *vp90-2-02-size-66x34.webm.md5
+4151b8c29452d5c2266397a7b9bf688899a2937b *vp90-2-02-size-66x64.webm
+69486e7fd9e380b6c97a03d3e167affc79f73840 *vp90-2-02-size-66x64.webm.md5
+68784a1ecac776fe2a3f230345af32f06f123536 *vp90-2-02-size-66x66.webm
+7f008c7f48d55e652fbd6bac405b51e0015c94f2 *vp90-2-02-size-66x66.webm.md5
+7e1bc449231ac1c5c2a11c9a6333b3e828763798 *vp90-2-03-size-196x196.webm
+6788a561466dace32d500194bf042e19cccc35e1 *vp90-2-03-size-196x196.webm.md5
+a170c9a88ec1dd854c7a471ff55fb2a97ac31870 *vp90-2-03-size-196x198.webm
+6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 *vp90-2-03-size-196x198.webm.md5
+68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f *vp90-2-03-size-196x200.webm
+bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e *vp90-2-03-size-196x200.webm.md5
+fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 *vp90-2-03-size-196x202.webm
+158ee72af578f39aad0c3b8f4cbed2fc78b57e0f *vp90-2-03-size-196x202.webm.md5
+dd28fb7247af534bdf5e6795a3ac429610489a0b *vp90-2-03-size-196x208.webm
+7546be847efce2d1c0a23f807bfb03f91b764e1e *vp90-2-03-size-196x208.webm.md5
+41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 *vp90-2-03-size-196x210.webm
+9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 *vp90-2-03-size-196x210.webm.md5
+5007bc618143437c009d6dde5fc2e86f72d37dc2 *vp90-2-03-size-196x224.webm
+858361d8f79b44df5545feabbc9754ec9ede632f *vp90-2-03-size-196x224.webm.md5
+0bcbe357fbc776c3fa68e7117179574ed7564a44 *vp90-2-03-size-196x226.webm
+72006a5f42031a43d70a2cd9fc1958962a86628f *vp90-2-03-size-196x226.webm.md5
+000239f048cceaac055558e97ef07078ebf65502 *vp90-2-03-size-198x196.webm
+2d6841901b72000c5340f30be602853438c1b787 *vp90-2-03-size-198x196.webm.md5
+ae75b766306a6404c3b3b35a6b6d53633c14fbdb *vp90-2-03-size-198x198.webm
+3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 *vp90-2-03-size-198x198.webm.md5
+95ffd573fa84ccef1cd59e1583e6054f56a5c83d *vp90-2-03-size-198x200.webm
+5d537e3c9b9c54418c79677543454c4cda3de1af *vp90-2-03-size-198x200.webm.md5
+ecc845bf574375f469bc91bf5c75c79dc00073d6 *vp90-2-03-size-198x202.webm
+1b59f5e111265615a7a459eeda8cc9045178d228 *vp90-2-03-size-198x202.webm.md5
+432fb27144fe421b9f51cf44d2750a26133ed585 *vp90-2-03-size-198x208.webm
+a58a67f4fb357c73ca078aeecbc0f782975630b1 *vp90-2-03-size-198x208.webm.md5
+ff5058e7e6a47435046612afc8536f2040989e6f *vp90-2-03-size-198x210.webm
+18d3be7935e52217e2e9400b6f2c681a9e45dc89 *vp90-2-03-size-198x210.webm.md5
+a0d55263c1ed2c03817454dd4ec4090d36dbc864 *vp90-2-03-size-198x224.webm
+efa366a299817e2da51c00623b165aab9fbb8d91 *vp90-2-03-size-198x224.webm.md5
+ccd142fa2920fc85bb753f049160c1c353ad1574 *vp90-2-03-size-198x226.webm
+534524a0b2dbff852e0b92ef09939db072f83243 *vp90-2-03-size-198x226.webm.md5
+0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 *vp90-2-03-size-200x196.webm
+41795f548181717906e7a504ba551f06c32102ae *vp90-2-03-size-200x196.webm.md5
+f6c2dc54e0989d50f01333fe40c91661fcbf849a *vp90-2-03-size-200x198.webm
+43df5d8c46a40089441392e6d096c588c1079a68 *vp90-2-03-size-200x198.webm.md5
+2f6e9df82e44fc145f0d9212dcccbed3de605e23 *vp90-2-03-size-200x200.webm
+757b2ef96b82093255725bab9690bbafe27f3caf *vp90-2-03-size-200x200.webm.md5
+40c5ea60415642a4a2e75c0d127b06309baadfab *vp90-2-03-size-200x202.webm
+3022c4a1c625b5dc04fdb1052d17d45b4171cfba *vp90-2-03-size-200x202.webm.md5
+6942ed5b27476bb8506d10e600d6ff60887780ca *vp90-2-03-size-200x208.webm
+c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be *vp90-2-03-size-200x208.webm.md5
+71dbc99b83c49d1da45589b91eabb98e2f4a7b1e *vp90-2-03-size-200x210.webm
+3f0b40da7eef7974b9bc326562f251feb67d9c7c *vp90-2-03-size-200x210.webm.md5
+6b6b8489081cfefb377cc5f18eb754ec2383f655 *vp90-2-03-size-200x224.webm
+a259df2ac0e294492e3f9d4315baa34cab044f04 *vp90-2-03-size-200x224.webm.md5
+c9adc1c9bb07559349a0b054df4af56f7a6edbb9 *vp90-2-03-size-200x226.webm
+714cec61e3575581e4f1a0e3921f4dfdbbd316c5 *vp90-2-03-size-200x226.webm.md5
+f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 *vp90-2-03-size-202x196.webm
+5b8e2e50fcea2c43b12fc067b8a9cc117af77bda *vp90-2-03-size-202x196.webm.md5
+c7b66ea3da87613deb47ff24a111247d3c384fec *vp90-2-03-size-202x198.webm
+517e91204b25586da943556f4adc5951c9be8bee *vp90-2-03-size-202x198.webm.md5
+935ef56b01cfdb4265a7e24696645209ccb20970 *vp90-2-03-size-202x200.webm
+55b8ec4a2513183144a8e27564596c06c7576fce *vp90-2-03-size-202x200.webm.md5
+849acf75e4f1d8d90046704e1103a18c64f30e35 *vp90-2-03-size-202x202.webm
+c79afc6660df2824e7df314e5bfd71f0d8acf76b *vp90-2-03-size-202x202.webm.md5
+17b3a4d55576b770626ccb856b9f1a6c8f6ae476 *vp90-2-03-size-202x208.webm
+0b887ff30409c58f2ccdc3bfacd6be7c69f8997a *vp90-2-03-size-202x208.webm.md5
+032d0ade4230fb2eef6d19915a7a1c9aa4a52617 *vp90-2-03-size-202x210.webm
+f78f8e79533c0c88dd2bfdcec9b1c07848568ece *vp90-2-03-size-202x210.webm.md5
+915a38c31fe425d5b93c837121cfa8082f5ea5bc *vp90-2-03-size-202x224.webm
+bf52a104074d0c5942aa7a5b31e11db47e43d48e *vp90-2-03-size-202x224.webm.md5
+be5cfde35666fa435e47d544d9258215beb1cf29 *vp90-2-03-size-202x226.webm
+2fa2f87502fda756b319389c8975204e130a2e3f *vp90-2-03-size-202x226.webm.md5
+15d908e97862b5b4bf295610df011fb9aa09909b *vp90-2-03-size-208x196.webm
+50c60792305d6a99be376dd596a6ff979325e6cc *vp90-2-03-size-208x196.webm.md5
+a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 *vp90-2-03-size-208x198.webm
+be85fb2c8d435a75484231356f07d06ebddd13cd *vp90-2-03-size-208x198.webm.md5
+05fd46deb7288e7253742091f56e54a9a441a187 *vp90-2-03-size-208x200.webm
+74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c *vp90-2-03-size-208x200.webm.md5
+d8985c4b386513a7385a4b3639bf91e469f1378b *vp90-2-03-size-208x202.webm
+0614a1e8d92048852adcf605a51333f5fabc7f03 *vp90-2-03-size-208x202.webm.md5
+28b002242238479165ba4fb87ee6b442c64b32e4 *vp90-2-03-size-208x208.webm
+37de5aca59bb900228400b0e115d3229edb9dcc0 *vp90-2-03-size-208x208.webm.md5
+c545be0050c2fad7c68427dbf86c62a739e94ab3 *vp90-2-03-size-208x210.webm
+d646eccb3cd578f94b54777e32b88898bef6e17a *vp90-2-03-size-208x210.webm.md5
+63a0cfe295b661026dd7b1bebb67acace1db766f *vp90-2-03-size-208x224.webm
+85c0361d93bf85a335248fef2767ff43eeef23db *vp90-2-03-size-208x224.webm.md5
+f911cc718d66e4fe8a865226088939c9eb1b7825 *vp90-2-03-size-208x226.webm
+a6d583a57876e7b7ec48625b2b2cdbcf70cab837 *vp90-2-03-size-208x226.webm.md5
+5bbb0f36da9a4683cf04e724124d8696332911bf *vp90-2-03-size-210x196.webm
+a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d *vp90-2-03-size-210x196.webm.md5
+8db64d6f9ce36dd382013b42ae4e292deba697bc *vp90-2-03-size-210x198.webm
+eda20f8268c7f4147bead4059e9c4897e09140a9 *vp90-2-03-size-210x198.webm.md5
+ce391505eeaf1d12406563101cd6b2dbbbb44bfc *vp90-2-03-size-210x200.webm
+79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 *vp90-2-03-size-210x200.webm.md5
+852db6fdc206e72391fc69b807f1954934679949 *vp90-2-03-size-210x202.webm
+f69414c5677ed2f2b8b37ae76429e509a92276a5 *vp90-2-03-size-210x202.webm.md5
+c424cc3edd2308da7d33f27acb36b54db5bf2595 *vp90-2-03-size-210x208.webm
+27b18562faa1b3184256f4eae8114b539b3e9d3e *vp90-2-03-size-210x208.webm.md5
+dd029eba719d50a2851592fa8b9b2efe88904930 *vp90-2-03-size-210x210.webm
+c853a1670465eaa04ca31b3511995f1b6ed4f58f *vp90-2-03-size-210x210.webm.md5
+d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 *vp90-2-03-size-210x224.webm
+93b793e79d987065b39ad8e2e71244368435fc25 *vp90-2-03-size-210x224.webm.md5
+3d0825fe83bcc125be1f78145ff43ca6d7588784 *vp90-2-03-size-210x226.webm
+5230f31a57ca3b5311698a12035d2644533b3ec4 *vp90-2-03-size-210x226.webm.md5
+6622f8bd9279e1ce45509a58a31a990052d45e14 *vp90-2-03-size-224x196.webm
+65411da07f60113f2be05c807879072b161d561e *vp90-2-03-size-224x196.webm.md5
+6744ff2ee2c41eb08c62ff30880833b6d77b585b *vp90-2-03-size-224x198.webm
+46ea3641d41acd4bff347b224646c060d5620385 *vp90-2-03-size-224x198.webm.md5
+8eb91f3416a1404705f370caecd74b2b458351b1 *vp90-2-03-size-224x200.webm
+196aefb854c8b95b9330263d6690b7ee15693ecf *vp90-2-03-size-224x200.webm.md5
+256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 *vp90-2-03-size-224x202.webm
+840ad8455dcf2be378c14b007e66fa642fc8196d *vp90-2-03-size-224x202.webm.md5
+db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 *vp90-2-03-size-224x208.webm
+40b9801d5620467499ac70fa6b7c40aaa5e1c331 *vp90-2-03-size-224x208.webm.md5
+e37159e687fe1cb24cffddfae059301adbaf4212 *vp90-2-03-size-224x210.webm
+1e4acd4b6334ae260c3eed08652d0ba8122073f2 *vp90-2-03-size-224x210.webm.md5
+0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 *vp90-2-03-size-224x224.webm
+37db449ad86fb286c2c02d94aa8fe0379c05044a *vp90-2-03-size-224x224.webm.md5
+32ebbf903a7d7881bcfe59639f1d472371f3bf27 *vp90-2-03-size-224x226.webm
+5cc3ac5dc9f6912491aa2ddac863f8187f34c569 *vp90-2-03-size-224x226.webm.md5
+9480ff5c2c32b1870ac760c87514912616e6cf01 *vp90-2-03-size-226x196.webm
+fe83655c0f1888f0af7b047785f01ba7ca9f1324 *vp90-2-03-size-226x196.webm.md5
+09cad4221996315cdddad4e502dbfabf53ca1d6a *vp90-2-03-size-226x198.webm
+e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 *vp90-2-03-size-226x198.webm.md5
+c34f49d55fe39e3f0b607e3cc95e30244225cecb *vp90-2-03-size-226x200.webm
+abb83edc868a3523ccd4e5523fac2efbe7c3df1f *vp90-2-03-size-226x200.webm.md5
+d17bc08eedfc60c4c23d576a6c964a21bf854d1f *vp90-2-03-size-226x202.webm
+1d22d2d0f375251c2d5a1acb4714bc35d963865b *vp90-2-03-size-226x202.webm.md5
+9bd537c4f92a25596ccd29fedfe181feac948b92 *vp90-2-03-size-226x208.webm
+6feb0e7325386275719f3511ada9e248a2ae7df4 *vp90-2-03-size-226x208.webm.md5
+4487067f6cedd495b93696b44b37fe0a3e7eda14 *vp90-2-03-size-226x210.webm
+49a8fa87945f47208168d541c068e78d878075d5 *vp90-2-03-size-226x210.webm.md5
+559fea2f8da42b33c1aa1dbc34d1d6781009847a *vp90-2-03-size-226x224.webm
+83c6d8f2969b759e10e5c6542baca1265c874c29 *vp90-2-03-size-226x224.webm.md5
+fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce *vp90-2-03-size-226x226.webm
+94ad19b8b699cea105e2ff18f0df2afd7242bcf7 *vp90-2-03-size-226x226.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c *vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678 *vp90-2-03-deltaq.webm.md5
+4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba *vp90-2-05-resize.ivf
+7f6d8879336239a43dbb6c9f13178cb11cf7ed09 *vp90-2-05-resize.ivf.md5
+bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe *vp90-2-06-bilinear.webm
+f6235f937552e11d8eb331ec55da6b3aa596b9ac *vp90-2-06-bilinear.webm.md5
+0c83a1e414fde3bccd6dc451bbaee68e59974c76 *vp90-2-07-frame_parallel.webm
+e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd *vp90-2-07-frame_parallel.webm.md5
+086c7edcffd699ae7d99d710fd7e53b18910ca5b *vp90-2-08-tile_1x2_frame_parallel.webm
+e981ecaabb29a80e0cbc1f4002384965ce8e95bb *vp90-2-08-tile_1x2_frame_parallel.webm.md5
+ed79be026a6f28646c5825da1c12d1fbc70f96a4 *vp90-2-08-tile_1x2.webm
+45b404e025841c9750895fc1a9f6bd384fe6a315 *vp90-2-08-tile_1x2.webm.md5
+cf8ea970c776797aae71dac8317ea926d9431cab *vp90-2-08-tile_1x4_frame_parallel.webm
+a481fbea465010b57af5a19ebf6d4a5cfe5b9278 *vp90-2-08-tile_1x4_frame_parallel.webm.md5
+0203ec456277a01aec401e7fb6c72c9a7e5e3f9d *vp90-2-08-tile_1x4.webm
+c9b237dfcc01c1b414fbcaa481d014a906ef7998 *vp90-2-08-tile_1x4.webm.md5
+20c75157e91ab41f82f70ffa73d5d01df8469287 *vp90-2-08-tile-4x4.webm
+ae7451810247fd13975cc257aa0301ff17102255 *vp90-2-08-tile-4x4.webm.md5
+2ec6e15422ac7a61af072dc5f27fcaf1942ce116 *vp90-2-08-tile-4x1.webm
+0094f5ee5e46345017c30e0aa4835b550212d853 *vp90-2-08-tile-4x1.webm.md5
+edea45dac4a3c2e5372339f8851d24c9bef803d6 *vp90-2-09-subpixel-00.ivf
+5428efc4bf92191faedf4a727fcd1d94966a7abc *vp90-2-09-subpixel-00.ivf.md5
+8cdd435d89029987ee196896e21520e5f879f04d *vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+091b373aa2ecb59aa5c647affd5bcafcc7547364 *vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+87ee28032b0963a44b73a850fcc816a6dc83efbb *vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f *vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+2064bdb22aa71c2691e0469fb62e8087a43f08f8 *vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+8080eda22694910162f0996e8a962612f381a57f *vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+a484b335c27ea189c0f0d77babea4a510ce12d50 *vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+3eacf1f006250be4cc5c92a7ef146e385ee62653 *vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+217f089a16447490823127b36ce0d945522accfd *vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+eedb3c641e60dacbe082491a16df529a5c9187df *vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+cb7e4955af183dff33bcba0c837f0922ab066400 *vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+48613f9380e2580002f8a09d6e412ea4e89a52b9 *vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+990a91f24dd284562d21d714ae773dff5452cad8 *vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+aa402217577a659cfc670157735b4b8e9aa670fe *vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
+b6dd558c90bca466b4bcbd03b3371648186465a7 *vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+1a9c2914ba932a38f0a143efc1ad0e318e78888b *vp90-2-tos_426x178_tile_1x1_181kbps.webm
+a3d2b09f24debad4747a1b3066f572be4273bced *vp90-2-tos_640x266_tile_1x2_336kbps.webm
+c64b03b5c090e6888cb39685c31f00a6b79fa45c *vp90-2-tos_854x356_tile_1x2_656kbps.webm
+94b533dbcf94292001e27cc51fec87f9e8c90c0b *vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
+0e7cd4135b231c9cea8d76c19f9e84b6fd77acec *vp90-2-08-tile_1x8_frame_parallel.webm
+c9b6850af28579b031791066457f4cb40df6e1c7 *vp90-2-08-tile_1x8_frame_parallel.webm.md5
+e448b6e83490bca0f8d58b4f4b1126a17baf4b0c *vp90-2-08-tile_1x8.webm
+5e524165f0397e6141d914f4f0a66267d7658376 *vp90-2-08-tile_1x8.webm.md5
+a34e14923d6d17b1144254d8187d7f85b700a63c *vp90-2-02-size-lf-1920x1080.webm
+e3b28ddcfaeb37fb4d132b93f92642a9ad17c22d *vp90-2-02-size-lf-1920x1080.webm.md5
+d48c5db1b0f8e60521a7c749696b8067886033a3 *vp90-2-09-aq2.webm
+84c1599298aac78f2fc05ae2274575d10569dfa0 *vp90-2-09-aq2.webm.md5
+55fc55ed73d578ed60fad05692579873f8bad758 *vp90-2-09-lf_deltas.webm
+54638c38009198c38c8f3b25c182b709b6c1fd2e *vp90-2-09-lf_deltas.webm.md5
+510d95f3beb3b51c572611fdaeeece12277dac30 *vp90-2-10-show-existing-frame.webm
+14d631096f4bfa2d71f7f739aec1448fb3c33bad *vp90-2-10-show-existing-frame.webm.md5
+d2feea7728e8d2c615981d0f47427a4a5a45d881 *vp90-2-10-show-existing-frame2.webm
+5f7c7811baa3e4f03be1dd78c33971b727846821 *vp90-2-10-show-existing-frame2.webm.md5
+b4318e75f73a6a08992c7326de2fb589c2a794c7 *vp90-2-11-size-351x287.webm
+b3c48382cf7d0454e83a02497c229d27720f9e20 *vp90-2-11-size-351x287.webm.md5
+8e0096475ea2535bac71d3e2fc09e0c451c444df *vp90-2-11-size-351x288.webm
+19e003804ec1dfc5464813b32339a15d5ba7b42f *vp90-2-11-size-351x288.webm.md5
+40cd1d6a188d7a88b21ebac1e573d3f270ab261e *vp90-2-11-size-352x287.webm
+68f515abe3858fc1eded46c8e6b2f727d43b5331 *vp90-2-11-size-352x287.webm.md5
+9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_1.ivf
+952eaac6eefa6f62179ed1db3e922fd42fecc624 *vp90-2-12-droppable_1.ivf.md5
+9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_2.ivf
+92a756469fa438220524e7fa6ac1d38c89514d17 *vp90-2-12-droppable_2.ivf.md5
+c21e97e4ba486520118d78b01a5cb6e6dc33e190 *vp90-2-12-droppable_3.ivf
+601abc9e4176c70f82ac0381365e9b151fdd24cd *vp90-2-12-droppable_3.ivf.md5
+61c640dad23cd4f7ad811b867e7b7e3521f4e3ba *vp90-2-13-largescaling.webm
+bca1b02eebdb088fa3f389fe0e7571e75a71f523 *vp90-2-13-largescaling.webm.md5
+c740708fa390806eebaf669909c1285ab464f886 *vp90-2-14-resize-fp-tiles-1-2.webm
+c7b85ffd8e11500f73f52e7dc5a47f57c393d47f *vp90-2-14-resize-fp-tiles-1-2.webm.md5
+ec8faa352a08f7033c60f29f80d505e2d7daa103 *vp90-2-14-resize-fp-tiles-1-4.webm
+6852c783fb421bda5ded3d4c5a3ffc46de03fbc1 *vp90-2-14-resize-fp-tiles-1-4.webm.md5
+8af61853ac0d07c4cb5bf7c2016661ba350b3497 *vp90-2-14-resize-fp-tiles-1-8.webm
+571353bac89fea60b5706073409aa3c0d42aefe9 *vp90-2-14-resize-fp-tiles-1-8.webm.md5
+b1c187ed69931496b82ec194017a79831bafceef *vp90-2-14-resize-fp-tiles-1-16.webm
+1c199a41afe42ce303944d70089eaaa2263b4a09 *vp90-2-14-resize-fp-tiles-1-16.webm.md5
+8eaae5a6f2dff934610b0c7a917d7f583ba74aa5 *vp90-2-14-resize-fp-tiles-2-1.webm
+db18fcf915f7ffaea6c39feab8bda6c1688af011 *vp90-2-14-resize-fp-tiles-2-1.webm.md5
+bc3046d138941e2a20e9ceec0ff6d25c25d12af3 *vp90-2-14-resize-fp-tiles-4-1.webm
+393211b808030d09a79927b17a4374b2f68a60ae *vp90-2-14-resize-fp-tiles-4-1.webm.md5
+6e8f8e31721a0f7f68a2964e36e0e698c2e276b1 *vp90-2-14-resize-fp-tiles-8-1.webm
+491fd3cd78fb0577bfe905bb64bbf64bd7d29140 *vp90-2-14-resize-fp-tiles-8-1.webm.md5
+cc5958da2a7edf739cd2cfeb18bd05e77903087e *vp90-2-14-resize-fp-tiles-16-1.webm
+0b58daf55aaf9063bf5b4fb33393d18b417dc428 *vp90-2-14-resize-fp-tiles-16-1.webm.md5
+821eeecc9d8c6a316134dd42d1ff057787d8047b *vp90-2-14-resize-fp-tiles-2-4.webm
+374c549f2839a3d0b732c4e3650700144037e76c *vp90-2-14-resize-fp-tiles-2-4.webm.md5
+dff8c8e49aacea9f4c7f22cb882da984e2a1b405 *vp90-2-14-resize-fp-tiles-2-8.webm
+e5b8820a7c823b21297d6e889e57ec401882c210 *vp90-2-14-resize-fp-tiles-2-8.webm.md5
+77629e4b23e32896aadf6e994c78bd4ffa1c7797 *vp90-2-14-resize-fp-tiles-2-16.webm
+1937f5df032664ac345d4613ad4417b4967b1230 *vp90-2-14-resize-fp-tiles-2-16.webm.md5
+380ba5702bb1ec7947697314ab0300b5c56a1665 *vp90-2-14-resize-fp-tiles-4-2.webm
+fde7b30d2aa64c1e851a4852f655d79fc542cf66 *vp90-2-14-resize-fp-tiles-4-2.webm.md5
+dc784b258ffa2abc2ae693d11792acf0bb9cb74f *vp90-2-14-resize-fp-tiles-8-2.webm
+edf26f0130aeee8342d49c2c8f0793ad008782d9 *vp90-2-14-resize-fp-tiles-8-2.webm.md5
+8e575789fd63ebf69e8eff1b9a4351a249a73bee *vp90-2-14-resize-fp-tiles-16-2.webm
+b6415318c1c589a1f64b9d569ce3cabbec2e0d52 *vp90-2-14-resize-fp-tiles-16-2.webm.md5
+e3adc944a11c4c5517e63664c84ebb0847b64d81 *vp90-2-14-resize-fp-tiles-4-8.webm
+03cba0532bc90a05b1990db830bf5701e24e7982 *vp90-2-14-resize-fp-tiles-4-8.webm.md5
+3b27a991eb6d78dce38efab35b7db682e8cbbee3 *vp90-2-14-resize-fp-tiles-4-16.webm
+5d16b7f82bf59f802724ddfd97abb487150b1c9d *vp90-2-14-resize-fp-tiles-4-16.webm.md5
+d5fed8c28c1d4c7e232ebbd25cf758757313ed96 *vp90-2-14-resize-fp-tiles-8-4.webm
+5a8ff8a52cbbde7bfab569beb6d971c5f8b904f7 *vp90-2-14-resize-fp-tiles-8-4.webm.md5
+17a5faa023d77ee9dad423a4e0d3145796bbc500 *vp90-2-14-resize-fp-tiles-16-4.webm
+2ef8daa3c3e750fd745130d0a76a39fe86f0448f *vp90-2-14-resize-fp-tiles-16-4.webm.md5
+9361e031f5cc990d8740863e310abb5167ae351e *vp90-2-14-resize-fp-tiles-8-16.webm
+57f13a2197486584f4e1a4f82ad969f3abc5a1a2 *vp90-2-14-resize-fp-tiles-8-16.webm.md5
+5803fc6fcbfb47b7661f3fcc6499158a32b56675 *vp90-2-14-resize-fp-tiles-16-8.webm
+be0fe64a1a4933696ff92d93f9bdecdbd886dc13 *vp90-2-14-resize-fp-tiles-16-8.webm.md5
+0ac0f6d20a0afed77f742a3b9acb59fd7b9cb093 *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
+1765315acccfe6cd12230e731369fcb15325ebfa *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
+4a2b7a683576fe8e330c7d1c4f098ff4e70a43a8 *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+1ef480392112b3509cb190afbb96f9a38dd9fbac *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+e615575ded499ea1d992f3b38e3baa434509cdcd *vp90-2-15-segkey.webm
+e3ab35d4316c5e81325c50f5236ceca4bc0d35df *vp90-2-15-segkey.webm.md5
+9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d *vp90-2-15-segkey_adpq.webm
+8f46ba5f785d0c2170591a153e0d0d146a7c8090 *vp90-2-15-segkey_adpq.webm.md5
+698a6910a97486b833073ef0c0b18d75dce57ee8 *vp90-2-16-intra-only.webm
+5661b0168752969f055eec37b05fa9fa947dc7eb *vp90-2-16-intra-only.webm.md5
+c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa *vp90-2-17-show-existing-frame.webm
+cc75f351818b9a619818f5cc77b9bc013d0c1e11 *vp90-2-17-show-existing-frame.webm.md5
+013708bd043f0821a3e56fb8404d82e7a0c7af6c *vp91-2-04-yuv422.webm
+1e58a7d23adad830a672f1733c9d2ae17890d59c *vp91-2-04-yuv422.webm.md5
+25d78f28948789d159a9453ebc13048b818251b1 *vp91-2-04-yuv440.webm
+81b3870b27a7f695ef6a43e87ab04bbdb5aee2f5 *vp91-2-04-yuv440.webm.md5
+0321d507ce62dedc8a51b4e9011f7a19aed9c3dc *vp91-2-04-yuv444.webm
+367e423dd41fdb49aa028574a2cfec5c2f325c5c *vp91-2-04-yuv444.webm.md5
+f77673b566f686853adefe0c578ad251b7241281 *vp92-2-20-10bit-yuv420.webm
+abdedfaddacbbe1a15ac7a54e86360f03629fb7a *vp92-2-20-10bit-yuv420.webm.md5
+0c2c355a1b17b28537c5a3b19997c8783b69f1af *vp92-2-20-12bit-yuv420.webm
+afb2c2798703e039189b0a15c8ac5685aa51d33f *vp92-2-20-12bit-yuv420.webm.md5
+0d661bc6e83da33238981481efd1b1802d323d88 *vp93-2-20-10bit-yuv422.webm
+10318907063db22eb02fad332556edbbecd443cc *vp93-2-20-10bit-yuv422.webm.md5
+ebc6be2f7511a0bdeac0b18c67f84ba7168839c7 *vp93-2-20-12bit-yuv422.webm
+235232267c6a1dc8a11e45d600f1c99d2f8b42d4 *vp93-2-20-12bit-yuv422.webm.md5
+f76b11b26d4beaceac7a7e7729dd5054d095164f *vp93-2-20-10bit-yuv440.webm
+757b33b5ac969c5999999488a731a3d1e6d9fb88 *vp93-2-20-10bit-yuv440.webm.md5
+df8807dbd29bec795c2db9c3c18e511fbb988101 *vp93-2-20-12bit-yuv440.webm
+ea4100930c3f59a1c23fbb33ab0ea01151cae159 *vp93-2-20-12bit-yuv440.webm.md5
+189c1b5f404ff41a50a7fc96341085ad541314a9 *vp93-2-20-10bit-yuv444.webm
+2dd0177c2f9d970b6e698892634c653630f91f40 *vp93-2-20-10bit-yuv444.webm.md5
+bd44cf6e1c27343e3639df9ac21346aedd5d6973 *vp93-2-20-12bit-yuv444.webm
+f36e5bdf5ec3213f32c0ddc82f95d82c5133bf27 *vp93-2-20-12bit-yuv444.webm.md5
+eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv
+89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv
+33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv
+8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv
+70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv
+8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv
+edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv
+9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv
+e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
+c77e4a26616add298a05dd5d12397be22c0e40c5 *vp90-2-18-resize.ivf
+c12918cf0a716417fba2de35c3fc5ab90e52dfce *vp90-2-18-resize.ivf.md5
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m
+b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+359e138dfb66863828397b77000ea7a83c844d02 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf
+bbd33de01c17b165b4ce00308e8a19a942023ab8 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+bcdedaf168ac225575468fda77502d2dc9fd5baa *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+65e93f9653bcf65b022f7d225268d1a90a76e7bb *vp90-2-19-skip.webm
+368dccdde5288c13c25695d2eacdc7402cadf613 *vp90-2-19-skip.webm.md5
+ffe460282df2b0e7d4603c2158653ad96f574b02 *vp90-2-19-skip-01.webm
+bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 *vp90-2-19-skip-01.webm.md5
+178f5bd239e38cc1cc2657a7a5e1a9f52ad2d3fe *vp90-2-19-skip-02.webm
+9020d5e260bd7df08e2b3d4b86f8623cee3daea2 *vp90-2-19-skip-02.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109 *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+552e372e9b78127389fb06b34545df2cec15ba6d *invalid-vp91-2-mixedrefcsp-444to420.ivf
+a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+812d05a64a0d83c1b504d0519927ddc5a2cdb273 *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+f97088c7359fc8d3d5aa5eafe57bc7308b3ee124 *vp90-2-20-big_superframe-01.webm
+47d7d409785afa33b123376de0c907336e6c7bd7 *vp90-2-20-big_superframe-01.webm.md5
+65ade6d2786209582c50d34cfe22b3cdb033abaf *vp90-2-20-big_superframe-02.webm
+7c0ed8d04c4d06c5411dd2e5de2411d37f092db5 *vp90-2-20-big_superframe-02.webm.md5
+667ec8718c982aef6be07eb94f083c2efb9d2d16 *vp90-2-07-frame_parallel-1.webm
+bfc82bf848e9c05020d61e3ffc1e62f25df81d19 *vp90-2-07-frame_parallel-1.webm.md5
+efd5a51d175cfdacd169ed23477729dc558030dc *invalid-vp90-2-07-frame_parallel-1.webm
+9f912712ec418be69adb910e2ca886a63c4cec08 *invalid-vp90-2-07-frame_parallel-2.webm
+445f5a53ca9555341852997ccdd480a51540bd14 *invalid-vp90-2-07-frame_parallel-3.webm
+d18c90709a0d03c82beadf10898b27d88fff719c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf
+d06285d109ecbaef63b0cbcc44d70a129186f51c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res
+e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf
+0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
+9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index 0814c2b..6bb08be 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -22,27 +22,36 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
-LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 
+## IVF writing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../ivfenc.c ../ivfenc.h
+
 ## Y4m parsing.
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
 
@@ -57,10 +66,10 @@
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
 
 # Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -82,6 +91,7 @@
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
@@ -89,20 +99,25 @@
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
 
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
+endif
+
 endif # VP8
 
 ## VP9
@@ -116,7 +131,7 @@
 LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
-
+LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
@@ -128,734 +143,30 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
+
 endif
 
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
+endif
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
+
 endif # VP9
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 
+TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
+
 endif # CONFIG_SHARED
 
-
-##
-## TEST DATA
-##
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
-
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
-
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
-
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
-
-# Invalid files for testing libvpx error checking.
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
-
-ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
-# BBB VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_426x240_tile_1x1_180kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_640x360_tile_1x2_337kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_854x480_tile_1x2_651kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
-#Sintel VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_426x182_tile_1x1_171kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_640x272_tile_1x2_318kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_854x364_tile_1x2_621kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
-# TOS VP9 streams
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_426x178_tile_1x1_181kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_640x266_tile_1x2_336kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_854x356_tile_1x2_656kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
-  vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
-endif  # CONFIG_DECODE_PERF_TESTS
-
-ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
-endif  # CONFIG_ENCODE_PERF_TESTS
+include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/libvpx/test/test_intra_pred_speed.cc b/libvpx/test/test_intra_pred_speed.cc
new file mode 100644
index 0000000..5d59e83
--- /dev/null
+++ b/libvpx/test/test_intra_pred_speed.cc
@@ -0,0 +1,384 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+//  Test and time VPX intra-predictor functions
+
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+                            const uint8_t *above, const uint8_t *left);
+
+const int kNumVp9IntraPredFuncs = 13;
+const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
+  "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", "H_PRED",
+  "D45_PRED", "D135_PRED", "D117_PRED", "D153_PRED", "D207_PRED", "D63_PRED",
+  "TM_PRED"
+};
+
+void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
+                   const char *const pred_func_names[], int num_funcs,
+                   const char *const signatures[], int block_size,
+                   int num_pixels_per_test) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int kBPS = 32;
+  const int kTotalPixels = 32 * kBPS;
+  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
+  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
+  uint8_t *const above = above_mem + 16;
+  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
+  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
+  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+
+  // some code assumes the top row has been extended:
+  // d45/d63 C-code, for instance, but not the assembly.
+  // TODO(jzern): this style of extension isn't strictly necessary.
+  ASSERT_LE(block_size, kBPS);
+  memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
+
+  for (int k = 0; k < num_funcs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(src, ref_src, sizeof(src));
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](src, kBPS, above, left);
+    }
+    libvpx_test::ClearSystemState();
+    vpx_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+    libvpx_test::MD5 md5;
+    md5.Add(src, sizeof(src));
+    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
+           elapsed_time, md5.Get());
+    EXPECT_STREQ(signatures[k], md5.Get());
+  }
+}
+
+void TestIntraPred4(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "4334156168b34ab599d9b5b30f522fe9",
+    "bc4649d5ba47c7ff178d92e475960fb0",
+    "8d316e5933326dcac24e1064794b5d12",
+    "a27270fed024eafd762c95de85f4da51",
+    "c33dff000d4256c2b8f3bf9e9bab14d2",
+    "44d8cddc2ad8f79b8ed3306051722b4f",
+    "eb54839b2bad6699d8946f01ec041cd0",
+    "ecb0d56ae5f677ea45127ce9d5c058e4",
+    "0b7936841f6813da818275944895b574",
+    "9117972ef64f91a58ff73e1731c81db2",
+    "c56d5e8c729e46825f46dd5d3b5d508a",
+    "c0889e2039bcf7bcb5d2f33cdca69adc",
+    "309a618577b27c648f9c5ee45252bc8f",
+  };
+  TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred8(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "7694ddeeefed887faf9d339d18850928",
+    "7d726b1213591b99f736be6dec65065b",
+    "19c5711281357a485591aaf9c96c0a67",
+    "ba6b66877a089e71cd938e3b8c40caac",
+    "802440c93317e0f8ba93fab02ef74265",
+    "9e09a47a15deb0b9d8372824f9805080",
+    "b7c2d8c662268c0c427da412d7b0311d",
+    "78339c1c60bb1d67d248ab8c4da08b7f",
+    "5c97d70f7d47de1882a6cd86c165c8a9",
+    "8182bf60688b42205acd95e59e967157",
+    "08323400005a297f16d7e57e7fe1eaac",
+    "95f7bfc262329a5849eda66d8f7c68ce",
+    "815b75c8e0d91cc1ae766dc5d3e445a3",
+  };
+  TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred16(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "b40dbb555d5d16a043dc361e6694fe53",
+    "fb08118cee3b6405d64c1fd68be878c6",
+    "6c190f341475c837cc38c2e566b64875",
+    "db5c34ccbe2c7f595d9b08b0dc2c698c",
+    "a62cbfd153a1f0b9fed13e62b8408a7a",
+    "143df5b4c89335e281103f610f5052e4",
+    "d87feb124107cdf2cfb147655aa0bb3c",
+    "7841fae7d4d47b519322e6a03eeed9dc",
+    "f6ebed3f71cbcf8d6d0516ce87e11093",
+    "3cc480297dbfeed01a1c2d78dd03d0c5",
+    "b9f69fa6532b372c545397dcb78ef311",
+    "a8fe1c70432f09d0c20c67bdb6432c4d",
+    "b8a41aa968ec108af447af4217cba91b",
+  };
+  TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs);
+}
+
+void TestIntraPred32(VpxPredFunc const *pred_funcs) {
+  static const int kNumVp9IntraFuncs = 13;
+  static const char *const kSignatures[kNumVp9IntraFuncs] = {
+    "558541656d84f9ae7896db655826febe",
+    "b3587a1f9a01495fa38c8cd3c8e2a1bf",
+    "4c6501e64f25aacc55a2a16c7e8f0255",
+    "b3b01379ba08916ef6b1b35f7d9ad51c",
+    "0f1eb38b6cbddb3d496199ef9f329071",
+    "911c06efb9ed1c3b4c104b232b55812f",
+    "9225beb0ddfa7a1d24eaa1be430a6654",
+    "0a6d584a44f8db9aa7ade2e2fdb9fc9e",
+    "b01c9076525216925f3456f034fb6eee",
+    "d267e20ad9e5cd2915d1a47254d3d149",
+    "ed012a4a5da71f36c2393023184a0e59",
+    "f162b51ed618d28b936974cff4391da5",
+    "9e1370c6d42e08d357d9612c93a71cfc",
+  };
+  TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
+                kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs);
+}
+
+}  // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
+#define INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, v, h, \
+                        d45, d135, d117, d153, d207, d63, tm)               \
+  TEST(arch, test_func) {                                                   \
+    static const VpxPredFunc vpx_intra_pred[] = {                           \
+        dc,   dc_left, dc_top, dc_128, v,   h, d45,                         \
+        d135, d117,    d153,   d207,   d63, tm};                            \
+    test_func(vpx_intra_pred);                                              \
+  }
+
+// -----------------------------------------------------------------------------
+// 4x4
+
+INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
+                vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c,
+                vpx_dc_128_predictor_4x4_c, vpx_v_predictor_4x4_c,
+                vpx_h_predictor_4x4_c, vpx_d45_predictor_4x4_c,
+                vpx_d135_predictor_4x4_c, vpx_d117_predictor_4x4_c,
+                vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
+                vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
+
+#if HAVE_SSE && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
+                vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
+                vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
+#endif  // HAVE_SSE && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,
+                NULL, vpx_d153_predictor_4x4_ssse3,
+                vpx_d207_predictor_4x4_ssse3, vpx_d63_predictor_4x4_ssse3, NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
+                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
+                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
+                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
+                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
+                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
+                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
+                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_4x4_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 8x8
+
+INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
+                vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c,
+                vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c,
+                vpx_h_predictor_8x8_c, vpx_d45_predictor_8x8_c,
+                vpx_d135_predictor_8x8_c, vpx_d117_predictor_8x8_c,
+                vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
+                vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
+
+#if HAVE_SSE && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE, TestIntraPred8, vpx_dc_predictor_8x8_sse,
+                vpx_dc_left_predictor_8x8_sse, vpx_dc_top_predictor_8x8_sse,
+                vpx_dc_128_predictor_8x8_sse, vpx_v_predictor_8x8_sse, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // HAVE_SSE && CONFIG_USE_X86INC
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_8x8_ssse3, vpx_d45_predictor_8x8_ssse3, NULL,
+                NULL, vpx_d153_predictor_8x8_ssse3,
+                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_8x8_c)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
+                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
+                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
+                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
+
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
+                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
+                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
+                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_8x8_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 16x16
+
+INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
+                vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c,
+                vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c,
+                vpx_h_predictor_16x16_c, vpx_d45_predictor_16x16_c,
+                vpx_d135_predictor_16x16_c, vpx_d117_predictor_16x16_c,
+                vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
+                vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
+                vpx_dc_left_predictor_16x16_sse2,
+                vpx_dc_top_predictor_16x16_sse2,
+                vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_16x16_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_16x16_ssse3, vpx_d45_predictor_16x16_ssse3,
+                NULL, NULL, vpx_d153_predictor_16x16_ssse3,
+                vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
+                NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
+                NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
+                vpx_dc_left_predictor_16x16_neon,
+                vpx_dc_top_predictor_16x16_neon,
+                vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
+                vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
+                vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa,
+                vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
+                vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_16x16_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 32x32
+
+INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
+                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
+                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
+                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
+                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
+                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
+                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if ARCH_X86_64
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+                vpx_dc_left_predictor_32x32_sse2,
+                vpx_dc_top_predictor_32x32_sse2,
+                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_sse2)
+#else
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+                vpx_dc_left_predictor_32x32_sse2,
+                vpx_dc_top_predictor_32x32_sse2,
+                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+#endif  // ARCH_X86_64
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_32x32_ssse3, vpx_d45_predictor_32x32_ssse3,
+                NULL, NULL, vpx_d153_predictor_32x32_ssse3,
+                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
+                NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
+                vpx_dc_left_predictor_32x32_neon,
+                vpx_dc_top_predictor_32x32_neon,
+                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
+                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
+                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
+                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
+                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_msa)
+#endif  // HAVE_MSA
+
+#include "test/test_libvpx.cc"
diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc
index fc035af..2649917 100644
--- a/libvpx/test/test_libvpx.cc
+++ b/libvpx/test/test_libvpx.cc
@@ -8,6 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
 #include "./vpx_config.h"
 #if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
@@ -15,12 +18,13 @@
 extern "C" {
 #if CONFIG_VP8
 extern void vp8_rtcd();
-#endif
+#endif  // CONFIG_VP8
 #if CONFIG_VP9
 extern void vp9_rtcd();
-#endif
+#endif  // CONFIG_VP9
+extern void vpx_dsp_rtcd();
+extern void vpx_scale_rtcd();
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
 
 static void append_negative_gtest_filter(const char *str) {
   std::string filter = ::testing::FLAGS_gtest_filter;
@@ -36,21 +40,21 @@
 #if ARCH_X86 || ARCH_X86_64
   const int simd_caps = x86_simd_caps();
   if (!(simd_caps & HAS_MMX))
-    append_negative_gtest_filter(":MMX/*");
+    append_negative_gtest_filter(":MMX.*:MMX/*");
   if (!(simd_caps & HAS_SSE))
-    append_negative_gtest_filter(":SSE/*");
+    append_negative_gtest_filter(":SSE.*:SSE/*");
   if (!(simd_caps & HAS_SSE2))
-    append_negative_gtest_filter(":SSE2/*");
+    append_negative_gtest_filter(":SSE2.*:SSE2/*");
   if (!(simd_caps & HAS_SSE3))
-    append_negative_gtest_filter(":SSE3/*");
+    append_negative_gtest_filter(":SSE3.*:SSE3/*");
   if (!(simd_caps & HAS_SSSE3))
-    append_negative_gtest_filter(":SSSE3/*");
+    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
   if (!(simd_caps & HAS_SSE4_1))
-    append_negative_gtest_filter(":SSE4_1/*");
+    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
   if (!(simd_caps & HAS_AVX))
-    append_negative_gtest_filter(":AVX/*");
+    append_negative_gtest_filter(":AVX.*:AVX/*");
   if (!(simd_caps & HAS_AVX2))
-    append_negative_gtest_filter(":AVX2/*");
+    append_negative_gtest_filter(":AVX2.*:AVX2/*");
 #endif
 
 #if !CONFIG_SHARED
@@ -59,11 +63,13 @@
 
 #if CONFIG_VP8
   vp8_rtcd();
-#endif
+#endif  // CONFIG_VP8
 #if CONFIG_VP9
   vp9_rtcd();
-#endif
-#endif
+#endif  // CONFIG_VP9
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+#endif  // !CONFIG_SHARED
 
   return RUN_ALL_TESTS();
 }
diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc
index 1f294f2..437ce44 100644
--- a/libvpx/test/test_vector_test.cc
+++ b/libvpx/test/test_vector_test.cc
@@ -12,6 +12,7 @@
 #include <cstdlib>
 #include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "../tools_common.h"
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -26,10 +27,24 @@
 
 namespace {
 
+enum DecodeMode {
+  kSerialMode,
+  kFrameParallelMode
+};
+
+const int kDecodeMode = 0;
+const int kThreads = 1;
+const int kFileName = 2;
+
+typedef std::tr1::tuple<int, int, const char*> DecodeParam;
+
 class TestVectorTest : public ::libvpx_test::DecoderTest,
-    public ::libvpx_test::CodecTestWithParam<const char*> {
+    public ::libvpx_test::CodecTestWithParam<DecodeParam> {
  protected:
-  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+  TestVectorTest()
+      : DecoderTest(GET_PARAM(0)),
+        md5_file_(NULL) {
+  }
 
   virtual ~TestVectorTest() {
     if (md5_file_)
@@ -71,8 +86,25 @@
 // checksums match the correct md5 data, then the test is passed. Otherwise,
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
-  const std::string filename = GET_PARAM(1);
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = std::tr1::get<kFileName>(input);
+  const int threads = std::tr1::get<kThreads>(input);
+  const int mode = std::tr1::get<kDecodeMode>(input);
   libvpx_test::CompressedVideoSource *video = NULL;
+  vpx_codec_flags_t flags = 0;
+  vpx_codec_dec_cfg_t cfg = {0};
+  char str[256];
+
+  if (mode == kFrameParallelMode) {
+    flags |= VPX_CODEC_USE_FRAME_THREADING;
+  }
+
+  cfg.threads = threads;
+
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s  mode: %s threads: %d",
+           filename.c_str(), mode == 0 ? "Serial" : "Parallel", threads);
+  SCOPED_TRACE(str);
 
   // Open compressed video file.
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
@@ -92,18 +124,50 @@
   const std::string md5_filename = filename + ".md5";
   OpenMD5File(md5_filename);
 
+  // Set decode config and flags.
+  set_cfg(cfg);
+  set_flags(flags);
+
   // Decode frame, and check the md5 matching.
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video, cfg));
   delete video;
 }
 
-VP8_INSTANTIATE_TEST_CASE(TestVectorTest,
-                          ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
-                                              libvpx_test::kVP8TestVectors +
-                                              libvpx_test::kNumVP8TestVectors));
-VP9_INSTANTIATE_TEST_CASE(TestVectorTest,
-                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
-                                              libvpx_test::kVP9TestVectors +
-                                              libvpx_test::kNumVP9TestVectors));
+// Test VP8 decode in serial mode with a single thread.
+// NOTE: VP8 only supports serial mode.
+#if CONFIG_VP8_DECODER
+VP8_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(0),  // Serial Mode.
+        ::testing::Values(1),  // Single thread.
+        ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
+                            libvpx_test::kVP8TestVectors +
+                                libvpx_test::kNumVP8TestVectors)));
+#endif  // CONFIG_VP8_DECODER
 
+// Test VP9 decode in serial mode with a single thread.
+#if CONFIG_VP9_DECODER
+VP9_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(0),  // Serial Mode.
+        ::testing::Values(1),  // Single thread.
+        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                            libvpx_test::kVP9TestVectors +
+                                libvpx_test::kNumVP9TestVectors)));
+
+// Test VP9 decode in frame parallel mode with different numbers of threads.
+INSTANTIATE_TEST_CASE_P(
+    VP9MultiThreadedFrameParallel, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Combine(
+            ::testing::Values(1),        // Frame Parallel mode.
+            ::testing::Range(2, 9),      // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                                libvpx_test::kVP9TestVectors +
+                                    libvpx_test::kNumVP9TestVectors))));
+#endif  // CONFIG_VP9_DECODER
 }  // namespace
diff --git a/libvpx/test/test_vectors.cc b/libvpx/test/test_vectors.cc
index dbdbdd6..434a382 100644
--- a/libvpx/test/test_vectors.cc
+++ b/libvpx/test/test_vectors.cc
@@ -165,7 +165,10 @@
   "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
   "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
   "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
+#if !CONFIG_SIZE_LIMIT || \
+    (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120)
   "vp90-2-13-largescaling.webm",
+#endif
   "vp90-2-14-resize-fp-tiles-1-16.webm",
   "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
   "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
@@ -181,7 +184,17 @@
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
   "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
-  "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm",
+  "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
+  "vp90-2-19-skip-01.webm", "vp90-2-19-skip-02.webm",
+  "vp91-2-04-yuv444.webm",
+  "vp91-2-04-yuv422.webm", "vp91-2-04-yuv440.webm",
+#if CONFIG_VP9_HIGHBITDEPTH
+  "vp92-2-20-10bit-yuv420.webm", "vp92-2-20-12bit-yuv420.webm",
+  "vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm",
+  "vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm",
+  "vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm",
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
diff --git a/libvpx/test/tile_independence_test.cc b/libvpx/test/tile_independence_test.cc
index d714452..193bd45 100644
--- a/libvpx/test/tile_independence_test.cc
+++ b/libvpx/test/tile_independence_test.cc
@@ -29,7 +29,7 @@
         md5_inv_order_(),
         n_tiles_(GET_PARAM(1)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    vpx_codec_dec_cfg_t cfg;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.w = 704;
     cfg.h = 144;
     cfg.threads = 1;
@@ -104,4 +104,5 @@
 
 VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
 
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
 }  // namespace
diff --git a/libvpx/test/tools_common.sh b/libvpx/test/tools_common.sh
index 0bfefba..0bdcc08 100755
--- a/libvpx/test/tools_common.sh
+++ b/libvpx/test/tools_common.sh
@@ -106,22 +106,24 @@
   fi
 }
 
+# $1 is the name of an environment variable containing a directory name to
+# test.
+test_env_var_dir() {
+  local dir=$(eval echo "\${$1}")
+  if [ ! -d "${dir}" ]; then
+    elog "'${dir}': No such directory"
+    elog "The $1 environment variable must be set to a valid directory."
+    return 1
+  fi
+}
+
 # This script requires that the LIBVPX_BIN_PATH, LIBVPX_CONFIG_PATH, and
 # LIBVPX_TEST_DATA_PATH variables are in the environment: Confirm that
 # the variables are set and that they all evaluate to directory paths.
 verify_vpx_test_environment() {
-  if [ ! -d "${LIBVPX_BIN_PATH}" ]; then
-    echo "The LIBVPX_BIN_PATH environment variable must be set."
-    return 1
-  fi
-  if [ ! -d "${LIBVPX_CONFIG_PATH}" ]; then
-    echo "The LIBVPX_CONFIG_PATH environment variable must be set."
-    return 1
-  fi
-  if [ ! -d "${LIBVPX_TEST_DATA_PATH}" ]; then
-    echo "The LIBVPX_TEST_DATA_PATH environment variable must be set."
-    return 1
-  fi
+  test_env_var_dir "LIBVPX_BIN_PATH" \
+    && test_env_var_dir "LIBVPX_CONFIG_PATH" \
+    && test_env_var_dir "LIBVPX_TEST_DATA_PATH"
 }
 
 # Greps vpx_config.h in LIBVPX_CONFIG_PATH for positional parameter one, which
@@ -144,6 +146,24 @@
   fi
 }
 
+# Echoes the path to $1 when it exists and is executable in ${LIBVPX_BIN_PATH}
+# (or its parent directory), otherwise echoes an empty string. The caller is
+# responsible for testing the result once the function returns.
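+# Typical use (illustrative):
+#   decoder="$(vpx_tool_path vpxdec)"
+#   [ -n "${decoder}" ] || return 1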
+vpx_tool_path() {
+  local readonly tool_name="$1"
+  local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  if [ ! -x "${tool_path}" ]; then
+    # Try one directory up: when running via examples.sh the tool could be in
+    # the parent directory of $LIBVPX_BIN_PATH.
+    tool_path="${LIBVPX_BIN_PATH}/../${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  fi
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
 # Echoes yes to stdout when the file named by positional parameter one exists
 # in LIBVPX_BIN_PATH, and is executable.
 vpx_tool_available() {
@@ -182,11 +202,11 @@
   [ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
 }
 
-# Filters strings from positional parameter one using the filter specified by
-# positional parameter two. Filter behavior depends on the presence of a third
-# positional parameter. When parameter three is present, strings that match the
-# filter are excluded. When omitted, strings matching the filter are included.
-# The filtered string is echoed to stdout.
+# Filters strings from $1 using the filter specified by $2. Filter behavior
+# depends on the presence of $3. When $3 is present, strings that match the
+# filter are excluded. When $3 is omitted, strings matching the filter are
+# included.
+# The filtered result is echoed to stdout.
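+# For example, filter_strings "one two three" tw is expected to echo "two",
+# and filter_strings "one two three" tw exclude to echo "one three".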
 filter_strings() {
   strings=${1}
   filter=${2}
@@ -235,6 +255,18 @@
     tests_to_filter=$(filter_strings "${tests_to_filter}" ${VPX_TEST_FILTER})
   fi
 
+  # User requested test listing: Dump test names and return.
+  if [ "${VPX_TEST_LIST_TESTS}" = "yes" ]; then
+    for test_name in $tests_to_filter; do
+      echo ${test_name}
+    done
+    return
+  fi
+
+  # Don't bother with the environment tests if everything else was disabled.
+  [ -z "${tests_to_filter}" ] && return
+
+  # Combine environment and actual tests.
   local tests_to_run="${env_tests} ${tests_to_filter}"
 
   check_git_hashes
@@ -265,6 +297,7 @@
     --prefix: Allows for a user specified prefix to be inserted before all test
               programs. Grants the ability, for example, to run test programs
               within valgrind.
+    --list-tests: List all test names and exit without actually running tests.
     --verbose: Verbose output.
 
     When the --bin-path option is not specified the script attempts to use
@@ -324,6 +357,9 @@
     --show-program-output)
       devnull=
       ;;
+    --list-tests)
+      VPX_TEST_LIST_TESTS=yes
+      ;;
     *)
       vpx_test_usage
       exit 1
@@ -347,8 +383,7 @@
   VPX_TEST_TEMP_ROOT=/tmp
 fi
 
-VPX_TEST_RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
-VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_${VPX_TEST_RAND}"
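+# $$ (the shell PID) gives each test run a distinct output directory.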
+VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$"
 
 if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \
    [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then
@@ -366,11 +401,16 @@
 VP9_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf"
 
 VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
+VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
 
 YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
 YUV_RAW_INPUT_WIDTH=352
 YUV_RAW_INPUT_HEIGHT=288
 
+Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT
 
@@ -383,15 +423,16 @@
   VP9_WEBM_FILE=${VP9_WEBM_FILE}
   VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX}
   VPX_TEST_FILTER=${VPX_TEST_FILTER}
+  VPX_TEST_LIST_TESTS=${VPX_TEST_LIST_TESTS}
   VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
   VPX_TEST_PREFIX=${VPX_TEST_PREFIX}
-  VPX_TEST_RAND=${VPX_TEST_RAND}
   VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
   VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}
   VPX_TEST_TEMP_ROOT=${VPX_TEST_TEMP_ROOT}
   VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
   YUV_RAW_INPUT=${YUV_RAW_INPUT}
   YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
-  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}"
+  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
+  Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}"
 
 fi  # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.
diff --git a/libvpx/test/user_priv_test.cc b/libvpx/test/user_priv_test.cc
index 22fce85..8512d88 100644
--- a/libvpx/test/user_priv_test.cc
+++ b/libvpx/test/user_priv_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
   libvpx_test::MD5 md5;
diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc
index 7d81182..7a34db6 100644
--- a/libvpx/test/variance_test.cc
+++ b/libvpx/test/variance_test.cc
@@ -7,86 +7,398 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
+
+#include <cstdlib>
 #include <new>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
-
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
-#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-# include "vp8/common/variance.h"
-#endif
-#if CONFIG_VP9_ENCODER
-# include "./vp9_rtcd.h"
-# include "vp9/encoder/vp9_variance.h"
-#endif
-#include "test/acm_random.h"
+#include "vpx_ports/mem.h"
 
 namespace {
 
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
+                                        unsigned int *sse);
+typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
+                                         int xoffset, int yoffset,
+                                         const uint8_t *b, int b_stride,
+                                         unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse,
+                                            const uint8_t *second_pred);
+typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+
 using ::std::tr1::get;
 using ::std::tr1::make_tuple;
 using ::std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
-static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
-                                 int l2w, int l2h, unsigned int *sse_ptr) {
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      int diff = ref[w * y + x] - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
-    }
+// Truncate high bit depth results by downshifting (with rounding) by:
+// 2 * (bit_depth - 8) for sse
+// (bit_depth - 8) for se
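+// For example, 12-bit input shifts sse right by 8 and se right by 4.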
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+  switch (bit_depth) {
+    case VPX_BITS_12:
+      *sse = (*sse + 128) >> 8;
+      *se = (*se + 8) >> 4;
+      break;
+    case VPX_BITS_10:
+      *sse = (*sse + 8) >> 4;
+      *se = (*se + 2) >> 2;
+      break;
+    case VPX_BITS_8:
+    default:
+      break;
   }
-  *sse_ptr = sse;
-  return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
 
-static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
-                                        int l2w, int l2h, int xoff, int yoff,
-                                        unsigned int *sse_ptr) {
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+static unsigned int mb_ss_ref(const int16_t *src) {
+  unsigned int res = 0;
+  for (int i = 0; i < 256; ++i) {
+    res += src[i] * src[i];
+  }
+  return res;
+}
+
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
+                             int l2w, int l2h, int src_stride_coeff,
+                             int ref_stride_coeff, uint32_t *sse_ptr,
+                             bool use_high_bit_depth_,
+                             vpx_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
-      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
-      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
-      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
-      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
-      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-      const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = r - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
+      int diff;
+      if (!use_high_bit_depth_) {
+        diff = ref[w * y * ref_stride_coeff + x] -
+               src[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        se += diff;
+        sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
     }
   }
-  *sse_ptr = sse;
-  return sse - (((int64_t) se * se) >> (l2w + l2h));
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
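+  // variance = sse - se^2 / (w * h); the shift performs the division.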
+  return static_cast<uint32_t>(sse -
+                               ((static_cast<int64_t>(se) * se) >>
+                                (l2w + l2h)));
+}
+
+/* The subpel reference functions differ from the codec version in one aspect:
+ * they calculate the bilinear factors directly instead of using a lookup table
+ * and therefore upshift xoff and yoff by 1. Only every other calculated value
+ * is used so the codec version shrinks the table to save space and maintain
+ * compatibility with vp8.
+ */
+static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
+                                    int l2w, int l2h, int xoff, int yoff,
+                                    uint32_t *sse_ptr,
+                                    bool use_high_bit_depth_,
+                                    vpx_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+
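+  // Upshift the offsets to 16th-pel units; only even positions are tested.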
+  xoff <<= 1;
+  yoff <<= 1;
+
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth_) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+  }
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  return static_cast<uint32_t>(sse -
+                               ((static_cast<int64_t>(se) * se) >>
+                                (l2w + l2h)));
+}
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+  SumOfSquaresTest() : func_(GetParam()) {}
+
+  virtual ~SumOfSquaresTest() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ConstTest();
+  void RefTest();
+
+  SumOfSquaresFunction func_;
+  ACMRandom rnd_;
+};
+
+void SumOfSquaresTest::ConstTest() {
+  int16_t mem[256];
+  unsigned int res;
+  for (int v = 0; v < 256; ++v) {
+    for (int i = 0; i < 256; ++i) {
+      mem[i] = v;
+    }
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(256u * (v * v), res);
+  }
+}
+
+void SumOfSquaresTest::RefTest() {
+  int16_t mem[256];
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 256; ++j) {
+      mem[j] = rnd_.Rand8() - rnd_.Rand8();
+    }
+
+    const unsigned int expected = mb_ss_ref(mem);
+    unsigned int res;
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(expected, res);
+  }
 }
 
 template<typename VarianceFunctionType>
 class VarianceTest
-    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            VarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
+    const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    block_size_ = width_ * height_;
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+      ref_ = new uint8_t[block_size_ * 2];
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(ref_ != NULL);
+  }
+
+  virtual void TearDown() {
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ZeroTest();
+  void RefTest();
+  void RefStrideTest();
+  void OneQuarterTest();
+
+  ACMRandom rnd_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  int width_, log2width_;
+  int height_, log2height_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  bool use_high_bit_depth_;
+  int block_size_;
+  VarianceFunctionType variance_;
+};
+
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::ZeroTest() {
+  for (int i = 0; i <= 255; ++i) {
+    if (!use_high_bit_depth_) {
+      memset(src_, i, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
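+      // Scale the 8-bit fill value up to the tested bit depth.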
+      vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
+                   block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    for (int j = 0; j <= 255; ++j) {
+      if (!use_high_bit_depth_) {
+        memset(ref_, j, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j  << (bit_depth_ - 8),
+                     block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      unsigned int var;
+      ASM_REGISTER_STATE_CHECK(
+          var = variance_(src_, width_, ref_, width_, &sse));
+      EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
+    }
+  }
+}
+
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefTest() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth_) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2;
+    unsigned int var1;
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_, ref_, width_, &sse1));
+    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+                                           log2height_, stride_coeff,
+                                           stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+  for (int i = 0; i < 10; ++i) {
+    int ref_stride_coeff = i % 2;
+    int src_stride_coeff = (i >> 1) % 2;
+    for (int j = 0; j < block_size_; j++) {
+      int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
+      int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
+      if (!use_high_bit_depth_) {
+        src_[src_ind] = rnd_.Rand8();
+        ref_[ref_ind] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2;
+    unsigned int var1;
+
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_ * src_stride_coeff,
+                         ref_, width_ * ref_stride_coeff, &sse1));
+    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+                                           log2height_, src_stride_coeff,
+                                           ref_stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
+  const int half = block_size_ / 2;
+  if (!use_high_bit_depth_) {
+    memset(src_, 255, block_size_);
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
+                 block_size_);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  unsigned int sse;
+  unsigned int var;
+  ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
+  const unsigned int expected = block_size_ * 255 * 255 / 4;
+  EXPECT_EQ(expected, var);
+}
+
+template<typename MseFunctionType>
+class MseTest
+    : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
+ public:
+  virtual void SetUp() {
+    const tuple<int, int, MseFunctionType>& params = this->GetParam();
+    log2width_  = get<0>(params);
+    width_ = 1 << log2width_;
+    log2height_ = get<1>(params);
+    height_ = 1 << log2height_;
+    mse_ = get<2>(params);
 
     rnd(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
@@ -103,9 +415,10 @@
   }
 
  protected:
-  void ZeroTest();
-  void RefTest();
-  void OneQuarterTest();
+  void RefTest_mse();
+  void RefTest_sse();
+  void MaxTest_mse();
+  void MaxTest_sse();
 
   ACMRandom rnd;
   uint8_t* src_;
@@ -113,161 +426,287 @@
   int width_, log2width_;
   int height_, log2height_;
   int block_size_;
-  VarianceFunctionType variance_;
+  MseFunctionType mse_;
 };
 
-template<typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::ZeroTest() {
-  for (int i = 0; i <= 255; ++i) {
-    memset(src_, i, block_size_);
-    for (int j = 0; j <= 255; ++j) {
-      memset(ref_, j, block_size_);
-      unsigned int sse;
-      unsigned int var;
-      ASM_REGISTER_STATE_CHECK(
-          var = variance_(src_, width_, ref_, width_, &sse));
-      EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j;
-    }
-  }
-}
-
-template<typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::RefTest() {
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_mse() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
       src_[j] = rnd.Rand8();
       ref_[j] = rnd.Rand8();
     }
     unsigned int sse1, sse2;
-    unsigned int var1;
-    ASM_REGISTER_STATE_CHECK(
-        var1 = variance_(src_, width_, ref_, width_, &sse1));
-    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
-                                           log2height_, &sse2);
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
     EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
   }
 }
 
-template<typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_sse() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size_; j++) {
+      src_[j] = rnd.Rand8();
+      ref_[j] = rnd.Rand8();
+    }
+    unsigned int sse2;
+    unsigned int var1;
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
+    variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
+                 stride_coeff, &sse2, false, VPX_BITS_8);
+    EXPECT_EQ(var1, sse2);
+  }
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_mse() {
   memset(src_, 255, block_size_);
-  const int half = block_size_ / 2;
-  memset(ref_, 255, half);
-  memset(ref_ + half, 0, half);
+  memset(ref_, 0, block_size_);
   unsigned int sse;
+  ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse));
+  const unsigned int expected = block_size_ * 255 * 255;
+  EXPECT_EQ(expected, sse);
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_sse() {
+  memset(src_, 255, block_size_);
+  memset(ref_, 0, block_size_);
   unsigned int var;
-  ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
-  const unsigned int expected = block_size_ * 255 * 255 / 4;
+  ASM_REGISTER_STATE_CHECK(var = mse_(src_, width_, ref_, width_));
+  const unsigned int expected = block_size_ * 255 * 255;
   EXPECT_EQ(expected, var);
 }
 
-#if CONFIG_VP9_ENCODER
+static uint32_t subpel_avg_variance_ref(const uint8_t *ref,
+                                        const uint8_t *src,
+                                        const uint8_t *second_pred,
+                                        int l2w, int l2h,
+                                        int xoff, int yoff,
+                                        uint32_t *sse_ptr,
+                                        bool use_high_bit_depth,
+                                        vpx_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
 
-unsigned int subpel_avg_variance_ref(const uint8_t *ref,
-                                     const uint8_t *src,
-                                     const uint8_t *second_pred,
-                                     int l2w, int l2h,
-                                     int xoff, int yoff,
-                                     unsigned int *sse_ptr) {
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w, h = 1 << l2h;
+  xoff <<= 1;
+  yoff <<= 1;
+
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
       // bilinear interpolation at a 16th pel step
-      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
-      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
-      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
-      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
-      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-      const int r = a + (((b - a) * yoff + 8) >> 4);
-      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
+      if (!use_high_bit_depth) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff =
+            ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
     }
   }
-  *sse_ptr = sse;
-  return sse - (((int64_t) se * se) >> (l2w + l2h));
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  return static_cast<uint32_t>(sse -
+                               ((static_cast<int64_t>(se) * se) >>
+                                (l2w + l2h)));
 }
 
 template<typename SubpelVarianceFunctionType>
 class SubpelVarianceTest
     : public ::testing::TestWithParam<tuple<int, int,
-                                            SubpelVarianceFunctionType> > {
+                                            SubpelVarianceFunctionType, int> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, SubpelVarianceFunctionType>& params =
+    const tuple<int, int, SubpelVarianceFunctionType, int>& params =
         this->GetParam();
     log2width_  = get<0>(params);
     width_ = 1 << log2width_;
     log2height_ = get<1>(params);
     height_ = 1 << log2height_;
     subpel_variance_ = get<2>(params);
+    if (get<3>(params)) {
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;
 
-    rnd(ACMRandom::DeterministicSeed());
+    rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_*sizeof(uint16_t))));
+      sec_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(
+              vpx_memalign(16, block_size_*sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(
+          new uint16_t[block_size_ + width_ + height_ + 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
-    vpx_free(src_);
-    delete[] ref_;
-    vpx_free(sec_);
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+      vpx_free(sec_);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+      vpx_free(CONVERT_TO_SHORTPTR(sec_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
     libvpx_test::ClearSystemState();
   }
 
  protected:
   void RefTest();
+  void ExtremeRefTest();
 
-  ACMRandom rnd;
+  ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
+  bool use_high_bit_depth_;
+  vpx_bit_depth_t bit_depth_;
   int width_, log2width_;
   int height_, log2height_;
-  int block_size_;
+  int block_size_, mask_;
   SubpelVarianceFunctionType subpel_variance_;
 };
 
 template<typename SubpelVarianceFunctionType>
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
-  for (int x = 0; x < 16; ++x) {
-    for (int y = 0; y < 16; ++y) {
-      for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
-      }
-      for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
                                                        src_, width_, &sse1));
-      const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
-                                                    log2height_, x, y, &sse2);
+      const unsigned int var2 = subpel_variance_ref(ref_, src_,
+                                                    log2width_, log2height_,
+                                                    x, y, &sse2,
+                                                    use_high_bit_depth_,
+                                                    bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
   }
 }
 
-template<>
-void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
-  for (int x = 0; x < 16; ++x) {
-    for (int y = 0; y < 16; ++y) {
-      for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
-        sec_[j] = rnd.Rand8();
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  // Compare against the reference implementation on extreme inputs: one half
+  // of each buffer is 0 and the other half is the maximum value, with src and
+  // ref using opposite patterns.
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size_ / 2;
+      if (!use_high_bit_depth_) {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, half + width_ + height_ + 1);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
+                     half + width_ + height_ + 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
-      for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
+      const unsigned int var2 =
+          subpel_variance_ref(ref_, src_, log2width_, log2height_,
+                              x, y, &sse2, use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template<>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth_) {
+        for (int j = 0; j < block_size_; j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < block_size_; j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+        }
+        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -276,493 +715,1326 @@
                                   src_, width_, &sse1, sec_));
       const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
                                                         log2width_, log2height_,
-                                                        x, y, &sse2);
+                                                        x, y, &sse2,
+                                                        use_high_bit_depth_,
+                                                        bit_depth_);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
   }
 }
 
-#endif  // CONFIG_VP9_ENCODER
+typedef MseTest<Get4x4SseFunc> VpxSseTest;
+typedef MseTest<VarianceMxNFunc> VpxMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
 
-// -----------------------------------------------------------------------------
-// VP8 test cases.
+TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
+TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
+TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 
-namespace vp8 {
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_c));
 
-#if CONFIG_VP8_ENCODER
-typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
+const Get4x4SseFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c;
+INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
 
-TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP8VarianceTest, Ref) { RefTest(); }
-TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
+const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c;
+const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c;
+const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c;
+const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c;
+INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_c),
+                                          make_tuple(4, 3, mse16x8_c),
+                                          make_tuple(3, 4, mse8x16_c),
+                                          make_tuple(3, 3, mse8x8_c)));
 
-const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
-const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c;
-const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c;
-const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
-const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
+const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c;
+const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c;
+const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c;
+const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c;
+const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c;
+const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c;
+const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c;
+const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c;
+const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c;
+const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c;
+const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
+const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
+const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
 INSTANTIATE_TEST_CASE_P(
-    C, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c)));
+    C, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
+                      make_tuple(6, 5, variance64x32_c, 0),
+                      make_tuple(5, 6, variance32x64_c, 0),
+                      make_tuple(5, 5, variance32x32_c, 0),
+                      make_tuple(5, 4, variance32x16_c, 0),
+                      make_tuple(4, 5, variance16x32_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 2, variance8x4_c, 0),
+                      make_tuple(2, 3, variance4x8_c, 0),
+                      make_tuple(2, 2, variance4x4_c, 0)));
 
-#if HAVE_NEON
-const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
-const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
-const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
-const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
+const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c;
 INSTANTIATE_TEST_CASE_P(
-    NEON, VP8VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(3, 4, variance8x16_neon),
-                      make_tuple(4, 3, variance16x8_neon),
-                      make_tuple(4, 4, variance16x16_neon)));
-#endif
+    C, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0),
+                      make_tuple(6, 5, subpel_var64x32_c, 0),
+                      make_tuple(5, 6, subpel_var32x64_c, 0),
+                      make_tuple(5, 5, subpel_var32x32_c, 0),
+                      make_tuple(5, 4, subpel_var32x16_c, 0),
+                      make_tuple(4, 5, subpel_var16x32_c, 0),
+                      make_tuple(4, 4, subpel_var16x16_c, 0),
+                      make_tuple(4, 3, subpel_var16x8_c, 0),
+                      make_tuple(3, 4, subpel_var8x16_c, 0),
+                      make_tuple(3, 3, subpel_var8x8_c, 0),
+                      make_tuple(3, 2, subpel_var8x4_c, 0),
+                      make_tuple(2, 3, subpel_var4x8_c, 0),
+                      make_tuple(2, 2, subpel_var4x4_c, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_var64x64_c =
+    vpx_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var64x32_c =
+    vpx_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x64_c =
+    vpx_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x32_c =
+    vpx_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x16_c =
+    vpx_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x32_c =
+    vpx_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x16_c =
+    vpx_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x8_c =
+    vpx_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x16_c =
+    vpx_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0),
+                      make_tuple(6, 5, subpel_avg_var64x32_c, 0),
+                      make_tuple(5, 6, subpel_avg_var32x64_c, 0),
+                      make_tuple(5, 5, subpel_avg_var32x32_c, 0),
+                      make_tuple(5, 4, subpel_avg_var32x16_c, 0),
+                      make_tuple(4, 5, subpel_avg_var16x32_c, 0),
+                      make_tuple(4, 4, subpel_avg_var16x16_c, 0),
+                      make_tuple(4, 3, subpel_avg_var16x8_c, 0),
+                      make_tuple(3, 4, subpel_avg_var8x16_c, 0),
+                      make_tuple(3, 3, subpel_avg_var8x8_c, 0),
+                      make_tuple(3, 2, subpel_avg_var8x4_c, 0),
+                      make_tuple(2, 3, subpel_avg_var4x8_c, 0),
+                      make_tuple(2, 2, subpel_avg_var4x4_c, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc>
+    VpxHBDSubpelAvgVarianceTest;
+
+TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
+
+/* TODO(debargha): This test does not support the highbd version
+const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
+const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c;
+const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c;
+const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c;
+
+const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c;
+const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c;
+const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c;
+const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c;
+
+const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c;
+const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c;
+const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c;
+const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c),
+                                        make_tuple(4, 4, highbd_12_mse16x8_c),
+                                        make_tuple(4, 4, highbd_12_mse8x16_c),
+                                        make_tuple(4, 4, highbd_12_mse8x8_c),
+                                        make_tuple(4, 4, highbd_10_mse16x16_c),
+                                        make_tuple(4, 4, highbd_10_mse16x8_c),
+                                        make_tuple(4, 4, highbd_10_mse8x16_c),
+                                        make_tuple(4, 4, highbd_10_mse8x8_c),
+                                        make_tuple(4, 4, highbd_8_mse16x16_c),
+                                        make_tuple(4, 4, highbd_8_mse16x8_c),
+                                        make_tuple(4, 4, highbd_8_mse8x16_c),
+                                        make_tuple(4, 4, highbd_8_mse8x8_c)));
+*/
+
+const VarianceMxNFunc highbd_12_variance64x64_c = vpx_highbd_12_variance64x64_c;
+const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c;
+const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c;
+const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c;
+const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c;
+const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c;
+const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c;
+const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c;
+const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c;
+const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c;
+const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
+const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
+const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
+const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
+const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
+const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
+const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c;
+const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c;
+const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c;
+const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c;
+const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c;
+const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c;
+const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c;
+const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
+const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
+const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
+const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
+const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
+const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
+const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c;
+const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c;
+const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c;
+const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c;
+const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c;
+const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c;
+const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c;
+const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c;
+const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c;
+const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+                      make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+                      make_tuple(6, 6, highbd_8_variance64x64_c, 8),
+                      make_tuple(6, 5, highbd_8_variance64x32_c, 8),
+                      make_tuple(5, 6, highbd_8_variance32x64_c, 8),
+                      make_tuple(5, 5, highbd_8_variance32x32_c, 8),
+                      make_tuple(5, 4, highbd_8_variance32x16_c, 8),
+                      make_tuple(4, 5, highbd_8_variance16x32_c, 8),
+                      make_tuple(4, 4, highbd_8_variance16x16_c, 8),
+                      make_tuple(4, 3, highbd_8_variance16x8_c, 8),
+                      make_tuple(3, 4, highbd_8_variance8x16_c, 8),
+                      make_tuple(3, 3, highbd_8_variance8x8_c, 8),
+                      make_tuple(3, 2, highbd_8_variance8x4_c, 8),
+                      make_tuple(2, 3, highbd_8_variance4x8_c, 8),
+                      make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+
+const SubpixVarMxNFunc highbd_8_subpel_var64x64_c =
+    vpx_highbd_8_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var64x32_c =
+    vpx_highbd_8_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x64_c =
+    vpx_highbd_8_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x32_c =
+    vpx_highbd_8_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x16_c =
+    vpx_highbd_8_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x32_c =
+    vpx_highbd_8_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x16_c =
+    vpx_highbd_8_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x8_c =
+    vpx_highbd_8_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x16_c =
+    vpx_highbd_8_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x8_c =
+    vpx_highbd_8_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x4_c =
+    vpx_highbd_8_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x8_c =
+    vpx_highbd_8_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x4_c =
+    vpx_highbd_8_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x64_c =
+    vpx_highbd_10_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x32_c =
+    vpx_highbd_10_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x64_c =
+    vpx_highbd_10_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x32_c =
+    vpx_highbd_10_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x16_c =
+    vpx_highbd_10_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x32_c =
+    vpx_highbd_10_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x16_c =
+    vpx_highbd_10_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x8_c =
+    vpx_highbd_10_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x16_c =
+    vpx_highbd_10_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x8_c =
+    vpx_highbd_10_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x4_c =
+    vpx_highbd_10_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x8_c =
+    vpx_highbd_10_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x4_c =
+    vpx_highbd_10_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x64_c =
+    vpx_highbd_12_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x32_c =
+    vpx_highbd_12_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x64_c =
+    vpx_highbd_12_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x32_c =
+    vpx_highbd_12_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x16_c =
+    vpx_highbd_12_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x32_c =
+    vpx_highbd_12_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x16_c =
+    vpx_highbd_12_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x8_c =
+    vpx_highbd_12_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x16_c =
+    vpx_highbd_12_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x8_c =
+    vpx_highbd_12_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x4_c =
+    vpx_highbd_12_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x8_c =
+    vpx_highbd_12_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x4_c =
+    vpx_highbd_12_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8),
+                      make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8),
+                      make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8),
+                      make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8),
+                      make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8),
+                      make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8),
+                      make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8),
+                      make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8),
+                      make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8),
+                      make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8),
+                      make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8),
+                      make_tuple(2, 3, highbd_8_subpel_var4x8_c, 8),
+                      make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8),
+                      make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10),
+                      make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10),
+                      make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10),
+                      make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10),
+                      make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10),
+                      make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10),
+                      make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10),
+                      make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10),
+                      make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10),
+                      make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10),
+                      make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10),
+                      make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10),
+                      make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10),
+                      make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12),
+                      make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12),
+                      make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12),
+                      make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12),
+                      make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12),
+                      make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12),
+                      make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12),
+                      make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12),
+                      make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12),
+                      make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12),
+                      make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12),
+                      make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12),
+                      make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12)));
+
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c =
+    vpx_highbd_8_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c =
+    vpx_highbd_8_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c =
+    vpx_highbd_10_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c =
+    vpx_highbd_10_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c =
+    vpx_highbd_12_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c =
+    vpx_highbd_12_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8),
+        make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8),
+        make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8),
+        make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8),
+        make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8),
+        make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8),
+        make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8),
+        make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8),
+        make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8),
+        make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8),
+        make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8),
+        make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8),
+        make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8),
+        make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10),
+        make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10),
+        make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10),
+        make_tuple(6, 6, highbd_12_subpel_avg_var64x64_c, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12),
+        make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12),
+        make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_MMX
-const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
-const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
-const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx;
-const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
-const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
+const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx;
+INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_mmx)));
+
+INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_mmx));
+
+const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx;
+const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx;
+const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx;
+const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx;
+const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx;
 INSTANTIATE_TEST_CASE_P(
-    MMX, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
-                      make_tuple(3, 3, variance8x8_mmx),
-                      make_tuple(3, 4, variance8x16_mmx),
-                      make_tuple(4, 3, variance16x8_mmx),
-                      make_tuple(4, 4, variance16x16_mmx)));
-#endif
+    MMX, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0),
+                      make_tuple(4, 3, variance16x8_mmx, 0),
+                      make_tuple(3, 4, variance8x16_mmx, 0),
+                      make_tuple(3, 3, variance8x8_mmx, 0),
+                      make_tuple(2, 2, variance4x4_mmx, 0)));
+
+const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx;
+const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx;
+const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx;
+const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx;
+const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0),
+                      make_tuple(4, 3, subpel_var16x8_mmx, 0),
+                      make_tuple(3, 4, subpel_var8x16_mmx, 0),
+                      make_tuple(3, 3, subpel_var8x8_mmx, 0),
+                      make_tuple(2, 2, subpel_var4x4_mmx, 0)));
+#endif  // HAVE_MMX
 
 #if HAVE_SSE2
-const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt;
-const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt;
-const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt;
-const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
-const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_sse2));
+
+const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2;
+const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2;
+const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2;
+const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2;
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_sse2),
+                                          make_tuple(4, 3, mse16x8_sse2),
+                                          make_tuple(3, 4, mse8x16_sse2),
+                                          make_tuple(3, 3, mse8x8_sse2)));
+
+const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2;
+const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2;
+const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2;
+const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2;
+const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2;
+const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2;
+const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2;
+const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2;
+const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2;
+const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2;
+const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2;
+const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2;
+const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_wmt),
-                      make_tuple(3, 3, variance8x8_wmt),
-                      make_tuple(3, 4, variance8x16_wmt),
-                      make_tuple(4, 3, variance16x8_wmt),
-                      make_tuple(4, 4, variance16x16_wmt)));
-#endif
-#endif  // CONFIG_VP8_ENCODER
+    SSE2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0),
+                      make_tuple(6, 5, variance64x32_sse2, 0),
+                      make_tuple(5, 6, variance32x64_sse2, 0),
+                      make_tuple(5, 5, variance32x32_sse2, 0),
+                      make_tuple(5, 4, variance32x16_sse2, 0),
+                      make_tuple(4, 5, variance16x32_sse2, 0),
+                      make_tuple(4, 4, variance16x16_sse2, 0),
+                      make_tuple(4, 3, variance16x8_sse2, 0),
+                      make_tuple(3, 4, variance8x16_sse2, 0),
+                      make_tuple(3, 3, variance8x8_sse2, 0),
+                      make_tuple(3, 2, variance8x4_sse2, 0),
+                      make_tuple(2, 3, variance4x8_sse2, 0),
+                      make_tuple(2, 2, variance4x4_sse2, 0)));
 
-}  // namespace vp8
-
-// -----------------------------------------------------------------------------
-// VP9 test cases.
-
-namespace vp9 {
-
-#if CONFIG_VP9_ENCODER
-typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
-typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
-
-TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP9VarianceTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
-
-const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
-const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
-const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
-const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
-const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
-const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
-const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
-const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
-const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
-const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
-const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
-const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
-const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c),
-                      make_tuple(2, 3, variance4x8_c),
-                      make_tuple(3, 2, variance8x4_c),
-                      make_tuple(3, 3, variance8x8_c),
-                      make_tuple(3, 4, variance8x16_c),
-                      make_tuple(4, 3, variance16x8_c),
-                      make_tuple(4, 4, variance16x16_c),
-                      make_tuple(4, 5, variance16x32_c),
-                      make_tuple(5, 4, variance32x16_c),
-                      make_tuple(5, 5, variance32x32_c),
-                      make_tuple(5, 6, variance32x64_c),
-                      make_tuple(6, 5, variance64x32_c),
-                      make_tuple(6, 6, variance64x64_c)));
-
-const vp9_subpixvariance_fn_t subpel_variance4x4_c =
-    vp9_sub_pixel_variance4x4_c;
-const vp9_subpixvariance_fn_t subpel_variance4x8_c =
-    vp9_sub_pixel_variance4x8_c;
-const vp9_subpixvariance_fn_t subpel_variance8x4_c =
-    vp9_sub_pixel_variance8x4_c;
-const vp9_subpixvariance_fn_t subpel_variance8x8_c =
-    vp9_sub_pixel_variance8x8_c;
-const vp9_subpixvariance_fn_t subpel_variance8x16_c =
-    vp9_sub_pixel_variance8x16_c;
-const vp9_subpixvariance_fn_t subpel_variance16x8_c =
-    vp9_sub_pixel_variance16x8_c;
-const vp9_subpixvariance_fn_t subpel_variance16x16_c =
-    vp9_sub_pixel_variance16x16_c;
-const vp9_subpixvariance_fn_t subpel_variance16x32_c =
-    vp9_sub_pixel_variance16x32_c;
-const vp9_subpixvariance_fn_t subpel_variance32x16_c =
-    vp9_sub_pixel_variance32x16_c;
-const vp9_subpixvariance_fn_t subpel_variance32x32_c =
-    vp9_sub_pixel_variance32x32_c;
-const vp9_subpixvariance_fn_t subpel_variance32x64_c =
-    vp9_sub_pixel_variance32x64_c;
-const vp9_subpixvariance_fn_t subpel_variance64x32_c =
-    vp9_sub_pixel_variance64x32_c;
-const vp9_subpixvariance_fn_t subpel_variance64x64_c =
-    vp9_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
-                      make_tuple(2, 3, subpel_variance4x8_c),
-                      make_tuple(3, 2, subpel_variance8x4_c),
-                      make_tuple(3, 3, subpel_variance8x8_c),
-                      make_tuple(3, 4, subpel_variance8x16_c),
-                      make_tuple(4, 3, subpel_variance16x8_c),
-                      make_tuple(4, 4, subpel_variance16x16_c),
-                      make_tuple(4, 5, subpel_variance16x32_c),
-                      make_tuple(5, 4, subpel_variance32x16_c),
-                      make_tuple(5, 5, subpel_variance32x32_c),
-                      make_tuple(5, 6, subpel_variance32x64_c),
-                      make_tuple(6, 5, subpel_variance64x32_c),
-                      make_tuple(6, 6, subpel_variance64x64_c)));
-
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
-    vp9_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
-    vp9_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
-    vp9_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
-    vp9_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
-    vp9_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
-    vp9_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
-    vp9_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
-    vp9_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
-    vp9_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
-    vp9_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
-    vp9_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
-    vp9_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
-    vp9_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
-                      make_tuple(2, 3, subpel_avg_variance4x8_c),
-                      make_tuple(3, 2, subpel_avg_variance8x4_c),
-                      make_tuple(3, 3, subpel_avg_variance8x8_c),
-                      make_tuple(3, 4, subpel_avg_variance8x16_c),
-                      make_tuple(4, 3, subpel_avg_variance16x8_c),
-                      make_tuple(4, 4, subpel_avg_variance16x16_c),
-                      make_tuple(4, 5, subpel_avg_variance16x32_c),
-                      make_tuple(5, 4, subpel_avg_variance32x16_c),
-                      make_tuple(5, 5, subpel_avg_variance32x32_c),
-                      make_tuple(5, 6, subpel_avg_variance32x64_c),
-                      make_tuple(6, 5, subpel_avg_variance64x32_c),
-                      make_tuple(6, 6, subpel_avg_variance64x64_c)));
-
-#if HAVE_MMX
-const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
-const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
-const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;
-const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
-const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
-                      make_tuple(3, 3, variance8x8_mmx),
-                      make_tuple(3, 4, variance8x16_mmx),
-                      make_tuple(4, 3, variance16x8_mmx),
-                      make_tuple(4, 4, variance16x16_mmx)));
-#endif
-
-#if HAVE_SSE2
 #if CONFIG_USE_X86INC
-const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
-const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
-const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
-const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
-const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
-const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
-const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
-const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
-const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
-const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
-const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
-const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
-const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
+const SubpixVarMxNFunc subpel_variance64x64_sse2 =
+    vpx_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc subpel_variance64x32_sse2 =
+    vpx_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x64_sse2 =
+    vpx_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc subpel_variance32x32_sse2 =
+    vpx_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x16_sse2 =
+    vpx_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x32_sse2 =
+    vpx_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc subpel_variance16x16_sse2 =
+    vpx_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x8_sse2 =
+    vpx_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x16_sse2 =
+    vpx_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse;
+const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_sse2),
-                      make_tuple(2, 3, variance4x8_sse2),
-                      make_tuple(3, 2, variance8x4_sse2),
-                      make_tuple(3, 3, variance8x8_sse2),
-                      make_tuple(3, 4, variance8x16_sse2),
-                      make_tuple(4, 3, variance16x8_sse2),
-                      make_tuple(4, 4, variance16x16_sse2),
-                      make_tuple(4, 5, variance16x32_sse2),
-                      make_tuple(5, 4, variance32x16_sse2),
-                      make_tuple(5, 5, variance32x32_sse2),
-                      make_tuple(5, 6, variance32x64_sse2),
-                      make_tuple(6, 5, variance64x32_sse2),
-                      make_tuple(6, 6, variance64x64_sse2)));
+    SSE2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0),
+                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
+                      make_tuple(2, 2, subpel_variance4x4_sse, 0)));
 
-const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
-    vp9_sub_pixel_variance4x4_sse;
-const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
-    vp9_sub_pixel_variance4x8_sse;
-const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 =
-    vp9_sub_pixel_variance8x4_sse2;
-const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 =
-    vp9_sub_pixel_variance8x8_sse2;
-const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 =
-    vp9_sub_pixel_variance8x16_sse2;
-const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 =
-    vp9_sub_pixel_variance16x8_sse2;
-const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 =
-    vp9_sub_pixel_variance16x16_sse2;
-const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 =
-    vp9_sub_pixel_variance16x32_sse2;
-const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 =
-    vp9_sub_pixel_variance32x16_sse2;
-const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 =
-    vp9_sub_pixel_variance32x32_sse2;
-const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 =
-    vp9_sub_pixel_variance32x64_sse2;
-const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 =
-    vp9_sub_pixel_variance64x32_sse2;
-const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 =
-    vp9_sub_pixel_variance64x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 =
+    vpx_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 =
+    vpx_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 =
+    vpx_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 =
+    vpx_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 =
+    vpx_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 =
+    vpx_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 =
+    vpx_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 =
+    vpx_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 =
+    vpx_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 =
+    vpx_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 =
+    vpx_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse =
+    vpx_sub_pixel_avg_variance4x8_sse;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse =
+    vpx_sub_pixel_avg_variance4x4_sse;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
-                      make_tuple(2, 3, subpel_variance4x8_sse),
-                      make_tuple(3, 2, subpel_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_variance64x64_sse2)));
+    SSE2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_sse, 0)));
+#endif  // CONFIG_USE_X86INC
 
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
-    vp9_sub_pixel_avg_variance4x4_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
-    vp9_sub_pixel_avg_variance4x8_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
-    vp9_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
-    vp9_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
-    vp9_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
-    vp9_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
-    vp9_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
-    vp9_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
-    vp9_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
-    vp9_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
-    vp9_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
-    vp9_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
-    vp9_sub_pixel_avg_variance64x64_sse2;
+#if CONFIG_VP9_HIGHBITDEPTH
+/* TODO(debargha): VpxHBDMseTest does not yet support the highbd MSE functions.
+const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
+const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2;
+const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2;
+const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2;
+const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2;
+const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2;
+const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2;
+const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2;
+const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2;
+const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
-                      make_tuple(2, 3, subpel_avg_variance4x8_sse),
-                      make_tuple(3, 2, subpel_avg_variance8x4_sse2),
-                      make_tuple(3, 3, subpel_avg_variance8x8_sse2),
-                      make_tuple(3, 4, subpel_avg_variance8x16_sse2),
-                      make_tuple(4, 3, subpel_avg_variance16x8_sse2),
-                      make_tuple(4, 4, subpel_avg_variance16x16_sse2),
-                      make_tuple(4, 5, subpel_avg_variance16x32_sse2),
-                      make_tuple(5, 4, subpel_avg_variance32x16_sse2),
-                      make_tuple(5, 5, subpel_avg_variance32x32_sse2),
-                      make_tuple(5, 6, subpel_avg_variance32x64_sse2),
-                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
-#endif
-#endif
+    SSE2, VpxHBDMseTest,
+    ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2),
+                      make_tuple(4, 3, highbd_12_mse16x8_sse2),
+                      make_tuple(3, 4, highbd_12_mse8x16_sse2),
+                      make_tuple(3, 3, highbd_12_mse8x8_sse2),
+                      make_tuple(4, 4, highbd_10_mse16x16_sse2),
+                      make_tuple(4, 3, highbd_10_mse16x8_sse2),
+                      make_tuple(3, 4, highbd_10_mse8x16_sse2),
+                      make_tuple(3, 3, highbd_10_mse8x8_sse2),
+                      make_tuple(4, 4, highbd_8_mse16x16_sse2),
+                      make_tuple(4, 3, highbd_8_mse16x8_sse2),
+                      make_tuple(3, 4, highbd_8_mse8x16_sse2),
+                      make_tuple(3, 3, highbd_8_mse8x8_sse2)));
+*/
+
+const VarianceMxNFunc highbd_12_variance64x64_sse2 =
+    vpx_highbd_12_variance64x64_sse2;
+const VarianceMxNFunc highbd_12_variance64x32_sse2 =
+    vpx_highbd_12_variance64x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x64_sse2 =
+    vpx_highbd_12_variance32x64_sse2;
+const VarianceMxNFunc highbd_12_variance32x32_sse2 =
+    vpx_highbd_12_variance32x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x16_sse2 =
+    vpx_highbd_12_variance32x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x32_sse2 =
+    vpx_highbd_12_variance16x32_sse2;
+const VarianceMxNFunc highbd_12_variance16x16_sse2 =
+    vpx_highbd_12_variance16x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x8_sse2 =
+    vpx_highbd_12_variance16x8_sse2;
+const VarianceMxNFunc highbd_12_variance8x16_sse2 =
+    vpx_highbd_12_variance8x16_sse2;
+const VarianceMxNFunc highbd_12_variance8x8_sse2 =
+    vpx_highbd_12_variance8x8_sse2;
+const VarianceMxNFunc highbd_10_variance64x64_sse2 =
+    vpx_highbd_10_variance64x64_sse2;
+const VarianceMxNFunc highbd_10_variance64x32_sse2 =
+    vpx_highbd_10_variance64x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x64_sse2 =
+    vpx_highbd_10_variance32x64_sse2;
+const VarianceMxNFunc highbd_10_variance32x32_sse2 =
+    vpx_highbd_10_variance32x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x16_sse2 =
+    vpx_highbd_10_variance32x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x32_sse2 =
+    vpx_highbd_10_variance16x32_sse2;
+const VarianceMxNFunc highbd_10_variance16x16_sse2 =
+    vpx_highbd_10_variance16x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x8_sse2 =
+    vpx_highbd_10_variance16x8_sse2;
+const VarianceMxNFunc highbd_10_variance8x16_sse2 =
+    vpx_highbd_10_variance8x16_sse2;
+const VarianceMxNFunc highbd_10_variance8x8_sse2 =
+    vpx_highbd_10_variance8x8_sse2;
+const VarianceMxNFunc highbd_8_variance64x64_sse2 =
+    vpx_highbd_8_variance64x64_sse2;
+const VarianceMxNFunc highbd_8_variance64x32_sse2 =
+    vpx_highbd_8_variance64x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x64_sse2 =
+    vpx_highbd_8_variance32x64_sse2;
+const VarianceMxNFunc highbd_8_variance32x32_sse2 =
+    vpx_highbd_8_variance32x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x16_sse2 =
+    vpx_highbd_8_variance32x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x32_sse2 =
+    vpx_highbd_8_variance16x32_sse2;
+const VarianceMxNFunc highbd_8_variance16x16_sse2 =
+    vpx_highbd_8_variance16x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x8_sse2 =
+    vpx_highbd_8_variance16x8_sse2;
+const VarianceMxNFunc highbd_8_variance8x16_sse2 =
+    vpx_highbd_8_variance8x16_sse2;
+const VarianceMxNFunc highbd_8_variance8x8_sse2 =
+    vpx_highbd_8_variance8x8_sse2;
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
+                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
+                      make_tuple(6, 6, highbd_8_variance64x64_sse2, 8),
+                      make_tuple(6, 5, highbd_8_variance64x32_sse2, 8),
+                      make_tuple(5, 6, highbd_8_variance32x64_sse2, 8),
+                      make_tuple(5, 5, highbd_8_variance32x32_sse2, 8),
+                      make_tuple(5, 4, highbd_8_variance32x16_sse2, 8),
+                      make_tuple(4, 5, highbd_8_variance16x32_sse2, 8),
+                      make_tuple(4, 4, highbd_8_variance16x16_sse2, 8),
+                      make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
+                      make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
+                      make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
+
+#if CONFIG_USE_X86INC
+const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 =
+    vpx_highbd_12_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
+    vpx_highbd_10_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 =
+    vpx_highbd_8_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+                      make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+                      make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8),
+                      make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8),
+                      make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8),
+                      make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8),
+                      make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8),
+                      make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8),
+                      make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8),
+                      make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8),
+                      make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8),
+                      make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8),
+                      make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8)));
+
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+        make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8),
+        make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8),
+        make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8),
+        make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8),
+        make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8),
+        make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8),
+        make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8),
+        make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8),
+        make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8),
+        make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8),
+        make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8)));
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
-
-const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
-    vp9_sub_pixel_variance4x4_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
-    vp9_sub_pixel_variance4x8_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 =
-    vp9_sub_pixel_variance8x4_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 =
-    vp9_sub_pixel_variance8x8_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 =
-    vp9_sub_pixel_variance8x16_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 =
-    vp9_sub_pixel_variance16x8_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 =
-    vp9_sub_pixel_variance16x16_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 =
-    vp9_sub_pixel_variance16x32_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 =
-    vp9_sub_pixel_variance32x16_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 =
-    vp9_sub_pixel_variance32x32_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 =
-    vp9_sub_pixel_variance32x64_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 =
-    vp9_sub_pixel_variance64x32_ssse3;
-const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 =
-    vp9_sub_pixel_variance64x64_ssse3;
+const SubpixVarMxNFunc subpel_variance64x64_ssse3 =
+    vpx_sub_pixel_variance64x64_ssse3;
+const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
+    vpx_sub_pixel_variance64x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
+    vpx_sub_pixel_variance32x64_ssse3;
+const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
+    vpx_sub_pixel_variance32x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
+    vpx_sub_pixel_variance32x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
+    vpx_sub_pixel_variance16x32_ssse3;
+const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
+    vpx_sub_pixel_variance16x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
+    vpx_sub_pixel_variance16x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
+    vpx_sub_pixel_variance8x16_ssse3;
+const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
+    vpx_sub_pixel_variance8x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
+    vpx_sub_pixel_variance8x4_ssse3;
+const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
+    vpx_sub_pixel_variance4x8_ssse3;
+const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
+    vpx_sub_pixel_variance4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_variance64x64_ssse3)));
+    SSSE3, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, subpel_variance4x4_ssse3, 0)));
 
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
-    vp9_sub_pixel_avg_variance4x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
-    vp9_sub_pixel_avg_variance4x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
-    vp9_sub_pixel_avg_variance8x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
-    vp9_sub_pixel_avg_variance8x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
-    vp9_sub_pixel_avg_variance8x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
-    vp9_sub_pixel_avg_variance16x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
-    vp9_sub_pixel_avg_variance16x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
-    vp9_sub_pixel_avg_variance16x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
-    vp9_sub_pixel_avg_variance32x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
-    vp9_sub_pixel_avg_variance32x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
-    vp9_sub_pixel_avg_variance32x64_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
-    vp9_sub_pixel_avg_variance64x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
-    vp9_sub_pixel_avg_variance64x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 =
+    vpx_sub_pixel_avg_variance64x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 =
+    vpx_sub_pixel_avg_variance64x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 =
+    vpx_sub_pixel_avg_variance32x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 =
+    vpx_sub_pixel_avg_variance32x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 =
+    vpx_sub_pixel_avg_variance32x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 =
+    vpx_sub_pixel_avg_variance16x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 =
+    vpx_sub_pixel_avg_variance16x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 =
+    vpx_sub_pixel_avg_variance16x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 =
+    vpx_sub_pixel_avg_variance8x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 =
+    vpx_sub_pixel_avg_variance8x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 =
+    vpx_sub_pixel_avg_variance8x4_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 =
+    vpx_sub_pixel_avg_variance4x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 =
+    vpx_sub_pixel_avg_variance4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
-                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
-                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
-                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
-                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
-                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
-                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
-                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
-                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
-                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
-                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
-                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
-                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
-#endif
-#endif
+    SSSE3, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0)));
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
+const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
+INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_avx2)));
 
-const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
-const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
-const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
-const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
-const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
+const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2;
+const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2;
+const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2;
+const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2;
+const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9VarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
-                      make_tuple(5, 4, variance32x16_avx2),
-                      make_tuple(5, 5, variance32x32_avx2),
-                      make_tuple(6, 5, variance64x32_avx2),
-                      make_tuple(6, 6, variance64x64_avx2)));
+    AVX2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_avx2, 0),
+                      make_tuple(6, 5, variance64x32_avx2, 0),
+                      make_tuple(5, 5, variance32x32_avx2, 0),
+                      make_tuple(5, 4, variance32x16_avx2, 0),
+                      make_tuple(4, 4, variance16x16_avx2, 0)));
 
-const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
-    vp9_sub_pixel_variance32x32_avx2;
-const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
-    vp9_sub_pixel_variance64x64_avx2;
+const SubpixVarMxNFunc subpel_variance64x64_avx2 =
+    vpx_sub_pixel_variance64x64_avx2;
+const SubpixVarMxNFunc subpel_variance32x32_avx2 =
+    vpx_sub_pixel_variance32x32_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_variance64x64_avx2)));
+    AVX2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_avx2, 0)));
 
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
-    vp9_sub_pixel_avg_variance32x32_avx2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
-    vp9_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 =
+    vpx_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 =
+    vpx_sub_pixel_avg_variance32x32_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
-                      make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+    AVX2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0)));
 #endif  // HAVE_AVX2
+
+#if HAVE_MEDIA
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+                      make_tuple(3, 3, variance8x8_media, 0)));
+
+const SubpixVarMxNFunc subpel_variance16x16_media =
+    vpx_sub_pixel_variance16x16_media;
+const SubpixVarMxNFunc subpel_variance8x8_media =
+    vpx_sub_pixel_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0),
+                      make_tuple(3, 3, subpel_variance8x8_media, 0)));
+#endif  // HAVE_MEDIA
+
 #if HAVE_NEON
-const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
-const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
-const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP9VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
-                      make_tuple(4, 4, variance16x16_neon),
-                      make_tuple(5, 5, variance32x32_neon)));
+const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
 
-const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
-    vp9_sub_pixel_variance8x8_neon;
-const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
-    vp9_sub_pixel_variance16x16_neon;
-const vp9_subpixvariance_fn_t subpel_variance32x32_neon =
-    vp9_sub_pixel_variance32x32_neon;
+const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
+
+const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon;
+const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon;
+const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon;
+const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon;
+const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon;
+const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon;
+const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon;
+const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon;
 INSTANTIATE_TEST_CASE_P(
-    NEON, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon),
-                      make_tuple(4, 4, subpel_variance16x16_neon),
-                      make_tuple(5, 5, subpel_variance32x32_neon)));
+    NEON, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0),
+                      make_tuple(6, 5, variance64x32_neon, 0),
+                      make_tuple(5, 6, variance32x64_neon, 0),
+                      make_tuple(5, 5, variance32x32_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0),
+                      make_tuple(4, 3, variance16x8_neon, 0),
+                      make_tuple(3, 4, variance8x16_neon, 0),
+                      make_tuple(3, 3, variance8x8_neon, 0)));
+
+const SubpixVarMxNFunc subpel_variance64x64_neon =
+    vpx_sub_pixel_variance64x64_neon;
+const SubpixVarMxNFunc subpel_variance32x32_neon =
+    vpx_sub_pixel_variance32x32_neon;
+const SubpixVarMxNFunc subpel_variance16x16_neon =
+    vpx_sub_pixel_variance16x16_neon;
+const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0),
+                      make_tuple(5, 5, subpel_variance32x32_neon, 0),
+                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
+                      make_tuple(3, 3, subpel_variance8x8_neon, 0)));
 #endif  // HAVE_NEON
-#endif  // CONFIG_VP9_ENCODER
 
-}  // namespace vp9
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_msa));
 
+const Get4x4SseFunc get4x4sse_cs_msa = vpx_get4x4sse_cs_msa;
+INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_msa)));
+
+const VarianceMxNFunc mse16x16_msa = vpx_mse16x16_msa;
+const VarianceMxNFunc mse16x8_msa = vpx_mse16x8_msa;
+const VarianceMxNFunc mse8x16_msa = vpx_mse8x16_msa;
+const VarianceMxNFunc mse8x8_msa = vpx_mse8x8_msa;
+INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_msa),
+                                          make_tuple(4, 3, mse16x8_msa),
+                                          make_tuple(3, 4, mse8x16_msa),
+                                          make_tuple(3, 3, mse8x8_msa)));
+
+const VarianceMxNFunc variance64x64_msa = vpx_variance64x64_msa;
+const VarianceMxNFunc variance64x32_msa = vpx_variance64x32_msa;
+const VarianceMxNFunc variance32x64_msa = vpx_variance32x64_msa;
+const VarianceMxNFunc variance32x32_msa = vpx_variance32x32_msa;
+const VarianceMxNFunc variance32x16_msa = vpx_variance32x16_msa;
+const VarianceMxNFunc variance16x32_msa = vpx_variance16x32_msa;
+const VarianceMxNFunc variance16x16_msa = vpx_variance16x16_msa;
+const VarianceMxNFunc variance16x8_msa = vpx_variance16x8_msa;
+const VarianceMxNFunc variance8x16_msa = vpx_variance8x16_msa;
+const VarianceMxNFunc variance8x8_msa = vpx_variance8x8_msa;
+const VarianceMxNFunc variance8x4_msa = vpx_variance8x4_msa;
+const VarianceMxNFunc variance4x8_msa = vpx_variance4x8_msa;
+const VarianceMxNFunc variance4x4_msa = vpx_variance4x4_msa;
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_msa, 0),
+                      make_tuple(6, 5, variance64x32_msa, 0),
+                      make_tuple(5, 6, variance32x64_msa, 0),
+                      make_tuple(5, 5, variance32x32_msa, 0),
+                      make_tuple(5, 4, variance32x16_msa, 0),
+                      make_tuple(4, 5, variance16x32_msa, 0),
+                      make_tuple(4, 4, variance16x16_msa, 0),
+                      make_tuple(4, 3, variance16x8_msa, 0),
+                      make_tuple(3, 4, variance8x16_msa, 0),
+                      make_tuple(3, 3, variance8x8_msa, 0),
+                      make_tuple(3, 2, variance8x4_msa, 0),
+                      make_tuple(2, 3, variance4x8_msa, 0),
+                      make_tuple(2, 2, variance4x4_msa, 0)));
+
+const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa;
+const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa;
+const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa;
+const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa;
+const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa;
+const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa;
+const SubpixVarMxNFunc subpel_variance16x16_msa =
+    vpx_sub_pixel_variance16x16_msa;
+const SubpixVarMxNFunc subpel_variance16x32_msa =
+    vpx_sub_pixel_variance16x32_msa;
+const SubpixVarMxNFunc subpel_variance32x16_msa =
+    vpx_sub_pixel_variance32x16_msa;
+const SubpixVarMxNFunc subpel_variance32x32_msa =
+    vpx_sub_pixel_variance32x32_msa;
+const SubpixVarMxNFunc subpel_variance32x64_msa =
+    vpx_sub_pixel_variance32x64_msa;
+const SubpixVarMxNFunc subpel_variance64x32_msa =
+    vpx_sub_pixel_variance64x32_msa;
+const SubpixVarMxNFunc subpel_variance64x64_msa =
+    vpx_sub_pixel_variance64x64_msa;
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
+                      make_tuple(2, 3, subpel_variance4x8_msa, 0),
+                      make_tuple(3, 2, subpel_variance8x4_msa, 0),
+                      make_tuple(3, 3, subpel_variance8x8_msa, 0),
+                      make_tuple(3, 4, subpel_variance8x16_msa, 0),
+                      make_tuple(4, 3, subpel_variance16x8_msa, 0),
+                      make_tuple(4, 4, subpel_variance16x16_msa, 0),
+                      make_tuple(4, 5, subpel_variance16x32_msa, 0),
+                      make_tuple(5, 4, subpel_variance32x16_msa, 0),
+                      make_tuple(5, 5, subpel_variance32x32_msa, 0),
+                      make_tuple(5, 6, subpel_variance32x64_msa, 0),
+                      make_tuple(6, 5, subpel_variance64x32_msa, 0),
+                      make_tuple(6, 6, subpel_variance64x64_msa, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_msa =
+    vpx_sub_pixel_avg_variance64x64_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_msa =
+    vpx_sub_pixel_avg_variance64x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_msa =
+    vpx_sub_pixel_avg_variance32x64_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_msa =
+    vpx_sub_pixel_avg_variance32x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_msa =
+    vpx_sub_pixel_avg_variance32x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_msa =
+    vpx_sub_pixel_avg_variance16x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_msa =
+    vpx_sub_pixel_avg_variance16x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_msa =
+    vpx_sub_pixel_avg_variance16x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_msa =
+    vpx_sub_pixel_avg_variance8x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_msa =
+    vpx_sub_pixel_avg_variance8x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_msa =
+    vpx_sub_pixel_avg_variance8x4_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_msa =
+    vpx_sub_pixel_avg_variance4x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_msa =
+    vpx_sub_pixel_avg_variance4x4_msa;
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_msa, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_msa, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_msa, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_msa, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_msa, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_msa, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_msa, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_msa, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_msa, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_msa, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_msa, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_msa, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_msa, 0)));
+#endif  // HAVE_MSA
 }  // namespace
diff --git a/libvpx/test/video_source.h b/libvpx/test/video_source.h
index c924f96..63294d1 100644
--- a/libvpx/test/video_source.h
+++ b/libvpx/test/video_source.h
@@ -92,12 +92,7 @@
  protected:
   void CloseFile() {
     if (file_) {
-      // Close if file pointer is associated with an open file
-#if defined(_WIN32)
-      if (file_->_ptr != NULL) fclose(file_);
-#else
-      if (fileno(file_) != -1) fclose(file_);
-#endif
+      fclose(file_);
       file_ = NULL;
     }
   }
@@ -139,8 +134,13 @@
 
 class DummyVideoSource : public VideoSource {
  public:
-  DummyVideoSource() : img_(NULL), limit_(100), width_(0), height_(0) {
-    SetSize(80, 64);
+  DummyVideoSource()
+      : img_(NULL),
+        limit_(100),
+        width_(80),
+        height_(64),
+        format_(VPX_IMG_FMT_I420) {
+    ReallocImage();
   }
 
   virtual ~DummyVideoSource() { vpx_img_free(img_); }
@@ -179,23 +179,35 @@
 
   void SetSize(unsigned int width, unsigned int height) {
     if (width != width_ || height != height_) {
-      vpx_img_free(img_);
-      raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
       width_ = width;
       height_ = height;
+      ReallocImage();
+    }
+  }
+
+  void SetImageFormat(vpx_img_fmt_t format) {
+    if (format_ != format) {
+      format_ = format;
+      ReallocImage();
     }
   }
 
  protected:
   virtual void FillFrame() { if (img_) memset(img_->img_data, 0, raw_sz_); }
 
+  void ReallocImage() {
+    vpx_img_free(img_);
+    img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
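+    // Size the raw buffer in bytes: allocated width rounded up to a multiple
+    // of 32, times height, times the image's bits-per-sample value.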
+    raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+  }
+
   vpx_image_t *img_;
   size_t       raw_sz_;
   unsigned int limit_;
   unsigned int frame_;
   unsigned int width_;
   unsigned int height_;
+  vpx_img_fmt_t format_;
 };
 
 
diff --git a/libvpx/test/vp8_boolcoder_test.cc b/libvpx/test/vp8_boolcoder_test.cc
index 99b5f0c..02d7162 100644
--- a/libvpx/test/vp8_boolcoder_test.cc
+++ b/libvpx/test/vp8_boolcoder_test.cc
@@ -16,12 +16,12 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include "test/acm_random.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx/vpx_integer.h"
 
-#include "vp8/encoder/boolhuff.h"
+#include "test/acm_random.h"
 #include "vp8/decoder/dboolhuff.h"
+#include "vp8/encoder/boolhuff.h"
+#include "vpx/vpx_integer.h"
 
 namespace {
 const int num_tests = 10;
diff --git a/libvpx/test/vp8_decrypt_test.cc b/libvpx/test/vp8_decrypt_test.cc
index 470fdf1..972a1d9 100644
--- a/libvpx/test/vp8_decrypt_test.cc
+++ b/libvpx/test/vp8_decrypt_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP8Decoder decoder(dec_cfg, 0);
 
   video.Begin();
diff --git a/libvpx/test/vp8_denoiser_sse2_test.cc b/libvpx/test/vp8_denoiser_sse2_test.cc
new file mode 100644
index 0000000..e8ca8d3
--- /dev/null
+++ b/libvpx/test/vp8_denoiser_sse2_test.cc
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 16 * 16;
+class VP8DenoiserTest : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~VP8DenoiserTest() {}
+
+  virtual void SetUp() {
+    increase_denoising_ = GetParam();
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int increase_denoising_;
+};
+
+TEST_P(VP8DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+  const int stride = 16;
+
+  // Allocate the space for input and output,
+  // where sig_block_c/_sse2 is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_sse2 is the denoised result from SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block_c[kNumPixels]);
+  // Since the VP8 denoiser changes the source signal in place,
+  // we need another copy of the source signal as the input to the SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block_sse2[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_ran =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block_sse2[j] = sig_block_c[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      temp = sig_block_c[j] + (rnd.Rand8() % 2 == 0 ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    // Test denoiser on the Y component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+
+    // Test denoiser on UV component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+  }
+}
+
+// Test with both increase_denoising settings (0 and 1).
+INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1));
+}  // namespace
diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc
index bdbf74e..11a653d 100644
--- a/libvpx/test/vp8_fdct4x4_test.cc
+++ b/libvpx/test/vp8_fdct4x4_test.cc
@@ -15,10 +15,10 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include "./vp8_rtcd.h"
-
-#include "test/acm_random.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp8_rtcd.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 namespace {
diff --git a/libvpx/test/vp8_fragments_test.cc b/libvpx/test/vp8_fragments_test.cc
new file mode 100644
index 0000000..cb0d1a1
--- /dev/null
+++ b/libvpx/test/vp8_fragments_test.cc
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+class VP8FragmentsTest
+    : public ::libvpx_test::EncoderTest,
+      public ::testing::Test {
+ protected:
+  VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
+  virtual ~VP8FragmentsTest() {}
+
+  virtual void SetUp() {
+    const unsigned long init_flags =  // NOLINT(runtime/int)
+        VPX_CODEC_USE_OUTPUT_PARTITION;
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_init_flags(init_flags);
+  }
+};
+
+TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) {
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+}  // namespace
diff --git a/libvpx/test/vp8_multi_resolution_encoder.sh b/libvpx/test/vp8_multi_resolution_encoder.sh
new file mode 100755
index 0000000..a8b7fe7
--- /dev/null
+++ b/libvpx/test/vp8_multi_resolution_encoder.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp8_multi_resolution_encoder example. To add new
+##  tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp8_mre_tests (on a new line).
+##
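+##  A new test would follow the same pattern as
+##  vp8_multi_resolution_encoder_three_formats() below and then be appended to
+##  vp8_mre_tests, e.g. (my_new_mre_test is a hypothetical name):
+##    vp8_mre_tests="vp8_multi_resolution_encoder_three_formats my_new_mre_test"
+##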
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8_multi_resolution_encoder_verify_environment() {
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ ! -e "${YUV_RAW_INPUT}" ]; then
+      elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+    local readonly app="vp8_multi_resolution_encoder"
+    if [ -z "$(vpx_tool_path "${app}")" ]; then
+      elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
+      return 1
+    fi
+  fi
+}
+
+# Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
+# vp8_multi_resolution_encoder after building the path to the executable.
+vp8_mre() {
+  local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull}
+}
+
+vp8_multi_resolution_encoder_three_formats() {
+  local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ "$(vp8_encode_available)" = "yes" ]; then
+      # Param order:
+      #  Input width
+      #  Input height
+      #  Input file path
+      #  Output file names
+      #  Output PSNR
+      vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" \
+        "${YUV_RAW_INPUT}" \
+        ${output_files} \
+        0
+
+      for output_file in ${output_files}; do
+        if [ ! -e "${output_file}" ]; then
+          elog "Missing output file: ${output_file}"
+          return 1
+        fi
+      done
+    fi
+  fi
+}
+
+vp8_mre_tests="vp8_multi_resolution_encoder_three_formats"
+run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}"
diff --git a/libvpx/test/vp9_arf_freq_test.cc b/libvpx/test/vp9_arf_freq_test.cc
new file mode 100644
index 0000000..87ff15b
--- /dev/null
+++ b/libvpx/test/vp9_arf_freq_test.cc
@@ -0,0 +1,238 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+namespace {
+
+const unsigned int kFrames = 100;
+const int kBitrate = 500;
+
+#define ARF_NOT_SEEN               1000001
+#define ARF_SEEN_ONCE              1000000
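+// Sentinel values for the minimum visible-frame run: no alt-ref frame has
+// been seen yet, or exactly one has been seen so far (see FramePktHook).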
+
+typedef struct {
+  const char *filename;
+  unsigned int width;
+  unsigned int height;
+  unsigned int framerate_num;
+  unsigned int framerate_den;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+typedef struct {
+  libvpx_test::TestMode mode;
+  int cpu_used;
+} TestEncodeParam;
+
+const TestVideoParam kTestVectors[] = {
+  // Artificially increase the framerate to trigger the default check.
+  {"hantro_collage_w352h288.yuv", 352, 288, 5000, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"hantro_collage_w352h288.yuv", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"rush_hour_444.y4m", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Add list of profile 2/3 test videos here ...
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+const TestEncodeParam kEncodeVectors[] = {
+  {::libvpx_test::kOnePassGood, 2},
+  {::libvpx_test::kOnePassGood, 5},
+  {::libvpx_test::kTwoPassGood, 1},
+  {::libvpx_test::kTwoPassGood, 2},
+  {::libvpx_test::kTwoPassGood, 5},
+  {::libvpx_test::kRealTime, 5},
+};
+
+const int kMinArfVectors[] = {
+  // NOTE: 0 refers to the default built-in logic in:
+  //       vp9_rc_get_default_min_gf_interval(...)
+  0, 4, 8, 12, 15
+};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class ArfFreqTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<TestVideoParam, \
+                                                 TestEncodeParam, int> {
+ protected:
+  ArfFreqTest()
+      : EncoderTest(GET_PARAM(0)),
+        test_video_param_(GET_PARAM(1)),
+        test_encode_param_(GET_PARAM(2)),
+        min_arf_requested_(GET_PARAM(3)) {
+  }
+
+  virtual ~ArfFreqTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(test_encode_param_.mode);
+    if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    min_run_ = ARF_NOT_SEEN;
+    run_of_visible_frames_ = 0;
+  }
+
+  int GetNumFramesInPkt(const vpx_codec_cx_pkt_t *pkt) {
+    const uint8_t *buffer = reinterpret_cast<uint8_t*>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];
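+    // The last byte of a superframe packet is a marker byte: bits 3-4 give
+    // the per-frame size field width minus one, bits 0-2 the frame count
+    // minus one, and the top three bits form the 110 signature checked below.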
+    const int mag = ((marker >> 3) & 3) + 1;
+    int frames = (marker & 0x7) + 1;
+    const unsigned int index_sz = 2 + mag  * frames;
+    // Check whether this packet carries a superframe index.
+    // A superframe is assumed to have only one visible frame, the rest being
+    // invisible. If no superframe index is found, then there is only
+    // one frame.
+    if (!((marker & 0xe0) == 0xc0 &&
+          pkt->data.frame.sz >= index_sz &&
+          buffer[pkt->data.frame.sz - index_sz] == marker)) {
+      frames = 1;
+    }
+    return frames;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+      return;
+    const int frames = GetNumFramesInPkt(pkt);
+    if (frames == 1) {
+      run_of_visible_frames_++;
+    } else if (frames == 2) {
+      if (min_run_ == ARF_NOT_SEEN) {
+        min_run_ = ARF_SEEN_ONCE;
+      } else if (min_run_ == ARF_SEEN_ONCE ||
+                 run_of_visible_frames_ < min_run_) {
+        min_run_ = run_of_visible_frames_;
+      }
+      run_of_visible_frames_ = 1;
+    } else {
+      min_run_ = 0;
+      run_of_visible_frames_ = 1;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
+      encoder->Control(VP8E_SET_CPUUSED, test_encode_param_.cpu_used);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_arf_requested_);
+      if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  int GetMinVisibleRun() const {
+    return min_run_;
+  }
+
+  int GetMinArfDistanceRequested() const {
+    if (min_arf_requested_)
+      return min_arf_requested_;
+    else
+      return vp9_rc_get_default_min_gf_interval(
+          test_video_param_.width, test_video_param_.height,
+          (double)test_video_param_.framerate_num /
+          test_video_param_.framerate_den);
+  }
+
+  TestVideoParam test_video_param_;
+  TestEncodeParam test_encode_param_;
+
+ private:
+  int min_arf_requested_;
+  int min_run_;
+  int run_of_visible_frames_;
+};
+
+TEST_P(ArfFreqTest, MinArfFreqTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8)
+    init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                            0, kFrames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_param_.filename,
+                                            test_video_param_.fmt,
+                                            test_video_param_.width,
+                                            test_video_param_.height,
+                                            test_video_param_.framerate_num,
+                                            test_video_param_.framerate_den,
+                                            0, kFrames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  const int min_run = GetMinVisibleRun();
+  const int min_arf_dist_requested = GetMinArfDistanceRequested();
+  if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+    const int min_arf_dist = min_run + 1;
+    EXPECT_GE(min_arf_dist, min_arf_dist_requested);
+  }
+  delete(video);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    ArfFreqTest,
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kEncodeVectors),
+    ::testing::ValuesIn(kMinArfVectors));
+
+VP10_INSTANTIATE_TEST_CASE(
+    ArfFreqTest,
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kEncodeVectors),
+    ::testing::ValuesIn(kMinArfVectors));
+}  // namespace
diff --git a/libvpx/test/vp9_avg_test.cc b/libvpx/test/vp9_avg_test.cc
new file mode 100644
index 0000000..d383131
--- /dev/null
+++ b/libvpx/test/vp9_avg_test.cc
@@ -0,0 +1,314 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class AverageTestBase : public ::testing::Test {
+ public:
+  AverageTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle up to 4 64x64 blocks with a stride of up to 128.
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+
+  virtual void SetUp() {
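+    // Round the source stride up to a multiple of 32.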
+    source_stride_ = (width_ + 31) & ~31;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Reference implementations: sum the pixels and return the rounded average.
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h)
+      for (int w = 0; w < 8; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 32) >> 6);
+  }
+
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h)
+      for (int w = 0; w < 4; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 8) >> 4);
+  }
+
+  void FillConstant(uint8_t fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = fill_constant;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = rnd_.Rand8();
+    }
+  }
+
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+
+  ACMRandom rnd_;
+};
+typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
+
+typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
+
+class AverageTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<AvgFunc>{
+ public:
+  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  void CheckAverages() {
+    unsigned int expected = 0;
+    if (GET_PARAM(3) == 8) {
+      expected = ReferenceAverage8x8(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    } else  if (GET_PARAM(3) == 4) {
+      expected = ReferenceAverage4x4(source_data_+ GET_PARAM(2),
+                                     source_stride_);
+    }
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(source_data_+ GET_PARAM(2),
+                                          source_stride_));
+    unsigned int actual = GET_PARAM(4)(source_data_+ GET_PARAM(2),
+                                       source_stride_);
+
+    EXPECT_EQ(expected, actual);
+  }
+};
+
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+    : AverageTestBase(16, GET_PARAM(0)),
+      hbuf_asm_(NULL),
+      hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    hbuf_asm_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    vpx_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    vpx_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+
+uint8_t* AverageTestBase::source_data_ = NULL;
+
+TEST_P(AverageTest, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, MaxValue) {
+  FillConstant(255);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, Random) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+        make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
+        make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProColTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+        make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
+        make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProColTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+#endif
+
+}  // namespace
diff --git a/libvpx/test/vp9_boolcoder_test.cc b/libvpx/test/vp9_boolcoder_test.cc
index c7f0cd8..c61bb4a 100644
--- a/libvpx/test/vp9_boolcoder_test.cc
+++ b/libvpx/test/vp9_boolcoder_test.cc
@@ -14,11 +14,10 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vp9/decoder/vp9_reader.h"
-#include "vp9/encoder/vp9_writer.h"
-
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
 
 using libvpx_test::ACMRandom;
 
@@ -50,9 +49,9 @@
         const int random_seed = 6432;
         const int kBufferSize = 10000;
         ACMRandom bit_rnd(random_seed);
-        vp9_writer bw;
+        vpx_writer bw;
         uint8_t bw_buffer[kBufferSize];
-        vp9_start_encode(&bw, bw_buffer);
+        vpx_start_encode(&bw, bw_buffer);
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
         for (int i = 0; i < kBitsToTest; ++i) {
@@ -61,16 +60,16 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          vp9_write(&bw, bit, static_cast<int>(probas[i]));
+          vpx_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
-        vp9_stop_encode(&bw);
+        vpx_stop_encode(&bw);
 
         // First bit should be zero
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
-        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vpx_reader br;
+        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
@@ -78,7 +77,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(vpx_read(&br, probas[i]), bit)
               << "pos: " << i << " / " << kBitsToTest
               << " bit_method: " << bit_method
               << " method: " << method;
diff --git a/libvpx/test/vp9_decrypt_test.cc b/libvpx/test/vp9_decrypt_test.cc
index 88a3c14..d988612 100644
--- a/libvpx/test/vp9_decrypt_test.cc
+++ b/libvpx/test/vp9_decrypt_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP9Decoder decoder(dec_cfg, 0);
 
   video.Begin();
diff --git a/libvpx/test/vp9_denoiser_sse2_test.cc b/libvpx/test/vp9_denoiser_sse2_test.cc
new file mode 100644
index 0000000..17c799d
--- /dev/null
+++ b/libvpx/test/vp9_denoiser_sse2_test.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 64 * 64;
+class VP9DenoiserTest : public ::testing::TestWithParam<BLOCK_SIZE> {
+ public:
+  virtual ~VP9DenoiserTest() {}
+
+  virtual void SetUp() {
+    bs_ = GetParam();
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  BLOCK_SIZE bs_;
+};
+
+TEST_P(VP9DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+
+  // Allocate the space for input and output,
+  // where sig_block is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_sse2 is the denoised result from SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_random =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      temp = sig_block[j] + ((rnd.Rand8() % 2 == 0) ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(
+        sig_block, 64, mc_avg_block, 64, avg_block_c,
+        64, 0, bs_, motion_magnitude_random));
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(
+        sig_block, 64, mc_avg_block, 64, avg_block_sse2,
+        64, 0, bs_, motion_magnitude_random));
+
+    // Test bitexactness.
+    for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) {
+      for (int w = 0; w < (4 << b_width_log2_lookup[bs_]); ++w) {
+        EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]);
+      }
+    }
+  }
+}
+
+// Test for all block sizes.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9DenoiserTest,
+    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
+                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                      BLOCK_64X64));
+}  // namespace
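Note on the bit-exactness loop above: only the pixels covered by the BLOCK_SIZE under test are compared; the rest of the 64x64 working buffers is ignored. A minimal sketch of the bounds, assuming b_width_log2_lookup/b_height_log2_lookup behave as the loop uses them:

    // For BLOCK_16X8 the compared region is:
    //   4 << b_width_log2_lookup[BLOCK_16X8]   -> 16 columns
    //   4 << b_height_log2_lookup[BLOCK_16X8]  ->  8 rows
    // Pixels outside that region of the 64x64 buffers are not checked.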
diff --git a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
new file mode 100644
index 0000000..a02070e
--- /dev/null
+++ b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vp9/decoder/vp9_decoder.h"
+
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_dec_cfg_t     cfg;
+  vp9_stream_info_t       si;
+  struct VP9Decoder      *pbi;
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+  vpx_decrypt_cb          decrypt_cb;
+  void                   *decrypt_state;
+  vpx_image_t             img;
+  int                     img_avail;
+  int                     flushed;
+  int                     invert_tile_order;
+  int                     frame_parallel_decode;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
+namespace {
+
+const unsigned int kFramerate = 50;
+const int kCpuUsed = 2;
+
+struct EncodePerfTestVideo {
+  const char *name;
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;
+  int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+  {"niklas_1280_720_30.y4m", 1280, 720, 600, 10},
+};
+
+struct EncodeParameters {
+  int32_t tile_rows;
+  int32_t tile_cols;
+  int32_t lossless;
+  int32_t error_resilient;
+  int32_t frame_parallel;
+  vpx_color_space_t cs;
+  // TODO(JBB): quantizers / bitrate
+};
+
+const EncodeParameters kVP9EncodeParameterSet[] = {
+    {0, 0, 0, 1, 0, VPX_CS_BT_601},
+    {0, 0, 0, 0, 0, VPX_CS_BT_709},
+    {0, 0, 1, 0, 0, VPX_CS_BT_2020},
+    {0, 2, 0, 0, 1, VPX_CS_UNKNOWN},
+    // TODO(JBB): Test profiles (requires more work).
+};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class VpxEncoderParmsGetToDecoder
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<EncodeParameters, \
+                                                 EncodePerfTestVideo> {
+ protected:
+  VpxEncoderParmsGetToDecoder()
+      : EncoderTest(GET_PARAM(0)),
+        encode_parms(GET_PARAM(1)) {
+  }
+
+  virtual ~VpxEncoderParmsGetToDecoder() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 25;
+    cfg_.g_error_resilient = encode_parms.error_resilient;
+    dec_cfg_.threads = 4;
+    test_video_ = GET_PARAM(2);
+    cfg_.rc_target_bitrate = test_video_.bitrate;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
+      encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                       encode_parms.frame_parallel);
+      encoder->Control(VP9E_SET_TILE_ROWS, encode_parms.tile_rows);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, encode_parms.tile_cols);
+      encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource& video,
+                                  libvpx_test::Decoder *decoder) {
+    vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder();
+    vpx_codec_alg_priv_t* priv =
+        (vpx_codec_alg_priv_t*) get_alg_priv(vp9_decoder);
+
+    VP9Decoder* pbi = priv->pbi;
+    VP9_COMMON* common = &pbi->common;
+
+    if (encode_parms.lossless) {
+      EXPECT_EQ(common->base_qindex, 0);
+      EXPECT_EQ(common->y_dc_delta_q, 0);
+      EXPECT_EQ(common->uv_dc_delta_q, 0);
+      EXPECT_EQ(common->uv_ac_delta_q, 0);
+      EXPECT_EQ(common->tx_mode, ONLY_4X4);
+    }
+    EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient);
+    if (encode_parms.error_resilient) {
+      EXPECT_EQ(common->frame_parallel_decoding_mode, 1);
+      EXPECT_EQ(common->use_prev_frame_mvs, 0);
+    } else {
+      EXPECT_EQ(common->frame_parallel_decoding_mode,
+                encode_parms.frame_parallel);
+    }
+    EXPECT_EQ(common->color_space, encode_parms.cs);
+    EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
+    EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
+
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  EncodePerfTestVideo test_video_;
+
+ private:
+  EncodeParameters encode_parms;
+};
+
+// TODO(hkuang): This test conflicts with frame parallel decode, so it is
+// disabled for now until that is fixed.
+TEST_P(VpxEncoderParmsGetToDecoder, DISABLED_BitstreamParms) {
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_.name)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_.name,
+                                            0, test_video_.frames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_.name,
+                                            VPX_IMG_FMT_I420,
+                                            test_video_.width,
+                                            test_video_.height,
+                                            kFramerate, 1, 0,
+                                            test_video_.frames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete(video);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    VpxEncoderParmsGetToDecoder,
+    ::testing::ValuesIn(kVP9EncodeParameterSet),
+    ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+
+VP10_INSTANTIATE_TEST_CASE(
+    VpxEncoderParmsGetToDecoder,
+    ::testing::ValuesIn(kVP9EncodeParameterSet),
+    ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+}  // namespace
diff --git a/libvpx/test/vp9_end_to_end_test.cc b/libvpx/test/vp9_end_to_end_test.cc
new file mode 100644
index 0000000..e100eb9
--- /dev/null
+++ b/libvpx/test/vp9_end_to_end_test.cc
@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kWidth  = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
+const double kPsnrThreshold[][5] = {
+  { 36.0, 37.0, 37.0, 37.0, 37.0 },
+  { 35.0, 36.0, 36.0, 36.0, 36.0 },
+  { 34.0, 35.0, 35.0, 35.0, 35.0 },
+  { 33.0, 34.0, 34.0, 34.0, 34.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 },
+  { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 },
+  { 29.0, 30.0, 30.0, 30.0, 30.0 },
+};
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+const TestVideoParam kTestVectors[] = {
+  {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, VPX_BITS_8, 1},
+  {"park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+  {"park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  {"park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2},
+  {"park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3},
+  {"park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3},
+  {"park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3},
+  {"park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2},
+  {"park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3},
+  {"park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3},
+  {"park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3},
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+// Encoding modes tested
+const libvpx_test::TestMode kEncodingModeVectors[] = {
+  ::libvpx_test::kTwoPassGood,
+  ::libvpx_test::kOnePassGood,
+  ::libvpx_test::kRealTime,
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = {1, 2, 3, 5, 6};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class EndToEndTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, \
+                                                 TestVideoParam, int> {
+ protected:
+  EndToEndTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        test_video_param_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)),
+        psnr_(0.0),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {
+  }
+
+  virtual ~EndToEndTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_)
+      return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[cpu_used_][encoding_mode_];
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8)
+    init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                            0, kFrames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_param_.filename,
+                                            test_video_param_.fmt,
+                                            kWidth, kHeight,
+                                            kFramerate, 1, 0, kFrames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, GetPsnrThreshold());
+  delete(video);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    EndToEndTestLarge,
+    ::testing::ValuesIn(kEncodingModeVectors),
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kCpuUsedVectors));
+
+VP10_INSTANTIATE_TEST_CASE(
+    EndToEndTestLarge,
+    ::testing::ValuesIn(kEncodingModeVectors),
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kCpuUsedVectors));
+}  // namespace
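Note on the threshold lookup above: GetPsnrThreshold() reads kPsnrThreshold[cpu_used_][encoding_mode_], so each row corresponds to a speed setting (0-7) and each column to an encoding mode. A minimal worked example, assuming kRealTime is the first libvpx_test::TestMode enumerator (value 0):

    // cpu_used_ = 2, encoding_mode_ = ::libvpx_test::kRealTime:
    //   GetPsnrThreshold() == kPsnrThreshold[2][0] == 34.0,
    // so the run passes only if the average PSNR collected in
    // PSNRPktHook() exceeds 34.0 dB.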
diff --git a/libvpx/test/vp9_error_block_test.cc b/libvpx/test/vp9_error_block_test.cc
new file mode 100644
index 0000000..8c5d5a2
--- /dev/null
+++ b/libvpx/test/vp9_error_block_test.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size,
+                                  int64_t *ssz, int bps);
+
+typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
+                        ErrorBlockParam;
+
+class ErrorBlockTest
+  : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {
+    error_block_op_     = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  ErrorBlockFunc error_block_op_;
+  ErrorBlockFunc ref_error_block_op_;
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      coeff[j]   = rnd(2 << 20) - (1 << 20);
+      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(ErrorBlockTest, ExtremeValues) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  int max_val = ((1 << 20) - 1);
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    int k = (i / 9) % 5;
+
+    // Change the maximum coeff value, to test different bit boundaries
+    if ( k == 4 && (i % 9) == 0 ) {
+      max_val >>= 1;
+    }
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      if (k < 4) {  // Test at maximum values
+        coeff[j]   = k % 2 ? max_val : -max_val;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      } else {
+        coeff[j]   = rnd(2 << 14) - (1 << 14);
+        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+      }
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_12),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libvpx/test/vp9_ethread_test.cc b/libvpx/test/vp9_ethread_test.cc
new file mode 100644
index 0000000..63f6dfe
--- /dev/null
+++ b/libvpx/test/vp9_ethread_test.cc
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+class VPxEncoderThreadTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VPxEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoder_initialized_(false),
+        tiles_(2),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+
+    md5_.clear();
+  }
+  virtual ~VPxEncoderThreadTest() {
+    delete decoder_;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+      cfg_.rc_2pass_vbr_minsection_pct = 5;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.g_error_resilient = 1;
+    }
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      // Encode 4 column tiles.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      } else {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
+        encoder->Control(VP9E_SET_AQ_MODE, 3);
+      }
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    const vpx_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
+    const vpx_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libvpx_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_.push_back(md5_res.Get());
+    }
+  }
+
+  bool encoder_initialized_;
+  int tiles_;
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  ::libvpx_test::Decoder *decoder_;
+  std::vector<std::string> md5_;
+};
+
+TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
+  std::vector<std::string> single_thr_md5, multi_thr_md5;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
+
+  cfg_.rc_target_bitrate = 1000;
+
+  // Encode using single thread.
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  single_thr_md5 = md5_;
+  md5_.clear();
+
+  // Encode using multiple threads.
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  multi_thr_md5 = md5_;
+  md5_.clear();
+
+  // Compare to check if two vectors are equal.
+  ASSERT_EQ(single_thr_md5, multi_thr_md5);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    VPxEncoderThreadTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
+    ::testing::Range(1, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+    VPxEncoderThreadTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Range(1, 3));
+}  // namespace
diff --git a/libvpx/test/vp9_frame_parallel_test.cc b/libvpx/test/vp9_frame_parallel_test.cc
new file mode 100644
index 0000000..f0df88a
--- /dev/null
+++ b/libvpx/test/vp9_frame_parallel_test.cc
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using std::string;
+
+#if CONFIG_WEBM_IO
+
+struct PauseFileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include skipped frames.
+  const char *expected_md5;
+  const int pause_frame_num;
+};
+
+// Decodes |filename| with |num_threads|. Pauses at the specified frame_num,
+// seeks to the next key frame, then continues decoding until the end. Returns
+// the md5 of the decoded frames; skipped frames are not included.
+string DecodeFileWithPause(const string &filename, int num_threads,
+                           int pause_num) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+  int in_frames = 0;
+  int out_frames = 0;
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  cfg.threads = num_threads;
+  vpx_codec_flags_t flags = 0;
+  flags |= VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  do {
+    ++in_frames;
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    // Pause at specified frame number.
+    if (in_frames == pause_num) {
+      // Flush the decoder and then seek to next key frame.
+      decoder.DecodeFrame(NULL, 0);
+      video.SeekToNextKeyFrame();
+    } else {
+      video.Next();
+    }
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+
+  EXPECT_EQ(in_frames, out_frames) <<
+      "Input frame count does not match output frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeFilesWithPause(const PauseFileList files[]) {
+  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5,
+                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
+          << "threads = " << t;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
+  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+  // one key frame for every ten frames.
+  static const PauseFileList files[] = {
+    { "vp90-2-07-frame_parallel-1.webm",
+      "6ea7c3875d67252e7caf2bc6e75b36b1", 6 },
+    { "vp90-2-07-frame_parallel-1.webm",
+      "4bb634160c7356a8d7d4299b6dc83a45", 12 },
+    { "vp90-2-07-frame_parallel-1.webm",
+      "89772591e6ef461f9fa754f916c78ed8", 26 },
+    { NULL, NULL, 0 },
+  };
+  DecodeFilesWithPause(files);
+}
+
+struct FileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include corrupted frames.
+  const char *expected_md5;
+  // Expected number of decoded frames which does not include corrupted frames.
+  const int expected_frame_count;
+};
+
+// Decodes |filename| with |num_threads|. Returns the md5 of the decoded
+// frames; corrupted frames are not included.
+string DecodeFile(const string &filename, int num_threads,
+                  int expected_frame_count) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = num_threads;
+  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  int out_frames = 0;
+  do {
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    // TODO(hkuang): frame parallel mode should return an error on corruption.
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    video.Next();
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+
+  EXPECT_EQ(expected_frame_count, out_frames) <<
+      "Input frame count does not match expected output frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5,
+                DecodeFile(iter->name, t, iter->expected_frame_count))
+          << "threads = " << t;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
+  static const FileList files[] = {
+    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 11th frame has corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-1.webm",
+      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
+    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 1st and 31st frames have
+    // corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-2.webm",
+      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
+    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 5th and 13th frames have
+    // corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-3.webm",
+      "8256544308de926b0681e04685b98677", 27 },
+    { NULL, NULL, 0 },
+  };
+  DecodeFiles(files);
+}
+
+TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
+  static const FileList files[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+    { "vp92-2-20-10bit-yuv420.webm",
+      "a16b99df180c584e8db2ffeda987d293", 10 },
+#endif
+    { NULL, NULL, 0 },
+  };
+  DecodeFiles(files);
+}
+#endif  // CONFIG_WEBM_IO
+}  // namespace
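For reference, the DecodeFileWithPause() helper above can also be called directly with the values from the PauseSeekResume table; a minimal sketch (the file name, MD5, and pause frame are the ones listed in that table):

    // Decode with 4 threads, pausing and seeking at frame 6, then compare
    // against the published MD5 for the file.
    EXPECT_EQ("6ea7c3875d67252e7caf2bc6e75b36b1",
              DecodeFileWithPause("vp90-2-07-frame_parallel-1.webm", 4, 6));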
diff --git a/libvpx/test/vp9_intrapred_test.cc b/libvpx/test/vp9_intrapred_test.cc
new file mode 100644
index 0000000..ad3327e
--- /dev/null
+++ b/libvpx/test/vp9_intrapred_test.cc
@@ -0,0 +1,301 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using libvpx_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+// Base class for VP9 intra prediction tests.
+class VP9IntraPredBase {
+ public:
+  virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  virtual void Predict(PREDICTION_MODE mode) = 0;
+
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    // For each pixel ensure that the calculated value is the same as reference.
+    for (int y = 0; y < block_size_; y++) {
+      for (int x = 0; x < block_size_; x++) {
+        *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+              << " Failed on Test Case Number "<< test_case_number;
+        }
+      }
+    }
+  }
+
+  void RunTest(uint16_t* left_col, uint16_t* above_data,
+               uint16_t* dst, uint16_t* ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    above_row_ = above_data + 16;
+    int error_count = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_size_*2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_size_; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+      Predict(DC_PRED);
+      CheckPrediction(i, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+  int block_size_;
+  uint16_t *above_row_;
+  uint16_t *left_col_;
+  uint16_t *dst_;
+  uint16_t *ref_dst_;
+  ptrdiff_t stride_;
+  int mask_;
+};
+
+typedef void (*intra_pred_fn_t)(
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+      const uint16_t *left, int bps);
+typedef std::tr1::tuple<intra_pred_fn_t,
+                        intra_pred_fn_t, int, int> intra_pred_params_t;
+class VP9IntraPredTest
+    : public VP9IntraPredBase,
+      public ::testing::TestWithParam<intra_pred_params_t> {
+
+  virtual void SetUp() {
+    pred_fn_    = GET_PARAM(0);
+    ref_fn_     = GET_PARAM(1);
+    block_size_ = GET_PARAM(2);
+    bit_depth_  = GET_PARAM(3);
+    stride_     = block_size_ * 3;
+    mask_       = (1 << bit_depth_) - 1;
+  }
+
+  virtual void Predict(PREDICTION_MODE mode) {
+    const uint16_t *const_above_row = above_row_;
+    const uint16_t *const_left_col = left_col_;
+    ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(pred_fn_(dst_, stride_, const_above_row,
+                                      const_left_col, bit_depth_));
+  }
+  intra_pred_fn_t pred_fn_;
+  intra_pred_fn_t ref_fn_;
+  int bit_depth_;
+};
+
+TEST_P(VP9IntraPredTest, IntraPredTests) {
+  // max block size is 32
+  DECLARE_ALIGNED(16, uint16_t, left_col[2*32]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2*32+32]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_USE_X86INC
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
+#endif  // !ARCH_X86_64
+
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16, 10),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32, 10),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
+#endif  // !ARCH_X86_64
+
+#if ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16, 12),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32, 12),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
+#endif  // !ARCH_X86_64
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+}  // namespace
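Note on the buffer layout in RunTest() above: above_row_ points 16 elements into above_data, so index -1 (the top-left pixel written by the fill loop) stays inside the array. A minimal sketch of the same layout:

    uint16_t above_data[2 * 32 + 32];        // 96 elements, as in IntraPredTests
    uint16_t *above_row = above_data + 16;
    // above_row[-1]      -> above_data[15], in bounds
    // above_row[2 * 32]  -> above_data[80], in bounds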
diff --git a/libvpx/test/vp9_lossless_test.cc b/libvpx/test/vp9_lossless_test.cc
index b3b9c92..09c1070 100644
--- a/libvpx/test/vp9_lossless_test.cc
+++ b/libvpx/test/vp9_lossless_test.cc
@@ -7,8 +7,10 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "./vpx_config.h"
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -19,17 +21,17 @@
 
 const int kMaxPsnr = 100;
 
-class LosslessTestLarge : public ::libvpx_test::EncoderTest,
+class LosslessTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
-  LosslessTestLarge()
+  LosslessTest()
       : EncoderTest(GET_PARAM(0)),
         psnr_(kMaxPsnr),
         nframes_(0),
         encoding_mode_(GET_PARAM(1)) {
   }
 
-  virtual ~LosslessTestLarge() {}
+  virtual ~LosslessTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -67,7 +69,7 @@
   libvpx_test::TestMode encoding_mode_;
 };
 
-TEST_P(LosslessTestLarge, TestLossLessEncoding) {
+TEST_P(LosslessTest, TestLossLessEncoding) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 2000;
@@ -85,7 +87,7 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+TEST_P(LosslessTest, TestLossLessEncoding444) {
   libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
 
   cfg_.g_profile = 1;
@@ -102,7 +104,7 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+TEST_P(LosslessTest, TestLossLessEncodingCtrl) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 2000;
@@ -121,5 +123,12 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-VP9_INSTANTIATE_TEST_CASE(LosslessTestLarge, ALL_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(LosslessTest,
+                          ::testing::Values(::libvpx_test::kRealTime,
+                                            ::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kTwoPassGood));
+
+VP10_INSTANTIATE_TEST_CASE(LosslessTest,
+                           ::testing::Values(::libvpx_test::kOnePassGood,
+                                             ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc
new file mode 100644
index 0000000..81d31fd
--- /dev/null
+++ b/libvpx/test/vp9_quantize_test.cc
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int number_of_iterations = 100;
+
+typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
+                             int skip_block, const int16_t *zbin,
+                             const int16_t *round, const int16_t *quant,
+                             const int16_t *quant_shift,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant,
+                             uint16_t *eob, const int16_t *scan,
+                             const int16_t *iscan);
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+    QuantizeParam;
+
+class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9QuantizeTest() {}
+  virtual void SetUp() {
+    quantize_op_   = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  QuantizeFunc quantize_op_;
+  QuantizeFunc ref_quantize_op_;
+};
+
+class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9Quantize32Test() {}
+  virtual void SetUp() {
+    quantize_op_   = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  QuantizeFunc quantize_op_;
+  QuantizeFunc ref_quantize_op_;
+};
+
+TEST_P(VP9QuantizeTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
+    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16()&mask_;
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = TX_32X32;
+    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16()&mask_;
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9QuantizeTest, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
+    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    // Two random entries
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = TX_32X32;
+    TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    // Two random entries
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16()&mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_8),
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_10),
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9Quantize32Test,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
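
The fixtures above drive each case as a (tested function, reference function,
bit depth) tuple. A minimal sketch of what the QuantizeFunc and QuantizeParam
typedefs are assumed to look like, inferred from how the tests invoke them
(the actual declarations appear earlier in the test file):

  typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                               int skip_block, const int16_t *zbin,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *quant_shift, tran_low_t *qcoeff,
                               tran_low_t *dqcoeff, const int16_t *dequant,
                               uint16_t *eob, const int16_t *scan,
                               const int16_t *iscan);
  typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
      QuantizeParam;

GET_PARAM(0) is the function under test, GET_PARAM(1) the C reference, and
GET_PARAM(2) the bit depth used to derive mask_.
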
diff --git a/libvpx/test/vp9_skip_loopfilter_test.cc b/libvpx/test/vp9_skip_loopfilter_test.cc
new file mode 100644
index 0000000..b0cc7ba
--- /dev/null
+++ b/libvpx/test/vp9_skip_loopfilter_test.cc
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const char kVp9TestFile[] = "vp90-2-08-tile_1x8_frame_parallel.webm";
+const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5";
+
+// Helper class for testing the VP9_SET_SKIP_LOOP_FILTER decoder control.
+class SkipLoopFilterTest {
+ public:
+  SkipLoopFilterTest()
+      : video_(NULL),
+        decoder_(NULL),
+        md5_file_(NULL) {}
+
+  ~SkipLoopFilterTest() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+    delete decoder_;
+    delete video_;
+  }
+
+  // If |num_threads| > 0, configures the decoder with that many threads.
+  void Init(int num_threads) {
+    expected_md5_[0] = '\0';
+    junk_[0] = '\0';
+    video_ = new libvpx_test::WebMVideoSource(kVp9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    if (num_threads > 0)
+      cfg.threads = num_threads;
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+
+    OpenMd5File(kVp9Md5File);
+  }
+
+  // Sets VP9_SET_SKIP_LOOP_FILTER to |value|, expecting |expected_value|.
+  void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) {
+    decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    if (res == VPX_CODEC_OK) {
+      ReadMd5();
+      video_->Next();
+    }
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      ReadMd5();
+    }
+    return VPX_CODEC_OK;
+  }
+
+  // Asserts the decoded frame's MD5 matches (or not) per |matches|.
+  void CheckMd5(bool matches) {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = dec_iter.Next();
+    CheckMd5Vpx(*img, matches);
+  }
+
+ private:
+  // TODO(fgalligan): Move the MD5 testing code into another class.
+  void OpenMd5File(const std::string &md5_file_name) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
+    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
+        << md5_file_name;
+  }
+
+  // Reads the next line of the MD5 file.
+  void ReadMd5() {
+    ASSERT_TRUE(md5_file_ != NULL);
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5_, junk_);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5_[32] = '\0';
+  }
+
+  // Asserts |img|'s MD5 matches (or not) the last MD5 read, per |matches|.
+  void CheckMd5Vpx(const vpx_image_t &img, bool matches) {
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check MD5.
+    if (matches)
+      ASSERT_STREQ(expected_md5_, actual_md5) << "MD5 checksums don't match";
+    else
+      ASSERT_STRNE(expected_md5_, actual_md5) << "MD5 checksums match";
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  FILE *md5_file_;
+  char expected_md5_[33];
+  char junk_[128];
+};
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilter) {
+  const int non_zero_value = 1;
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) {
+  const int non_zero_value = 1;
+  const int num_threads = 1;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) {
+  const int non_zero_value = 1;
+  const int num_threads = 8;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+TEST(SkipLoopFilterTest, WithLoopFilter) {
+  const int non_zero_value = 1;
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+  skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
+  skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(true);
+}
+
+TEST(SkipLoopFilterTest, ToggleLoopFilter) {
+  const int num_threads = 0;
+  SkipLoopFilterTest skip_loop_filter;
+  skip_loop_filter.Init(num_threads);
+
+  for (int i = 0; i < 10; ++i) {
+    skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK);
+    ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeOneFrame());
+  }
+  ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
+  skip_loop_filter.CheckMd5(false);
+}
+
+}  // namespace
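
Outside the gtest harness, the same control can be applied through the public
decoder API. A minimal sketch, assuming the standard VP9 decoder entry points
from vpx/vp8dx.h and omitting error handling:

  #include "vpx/vp8dx.h"
  #include "vpx/vpx_decoder.h"

  void DecodeWithoutLoopFilter() {
    vpx_codec_ctx_t decoder;
    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
    cfg.threads = 1;
    vpx_codec_dec_init(&decoder, vpx_codec_vp9_dx(), &cfg, 0);
    // A non-zero value skips the loop filter on all subsequent frames.
    vpx_codec_control(&decoder, VP9_SET_SKIP_LOOP_FILTER, 1);
    // ... feed compressed frames with vpx_codec_decode() ...
    vpx_codec_destroy(&decoder);
  }
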
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc
index fabb438..3cad4d7 100644
--- a/libvpx/test/vp9_subtract_test.cc
+++ b/libvpx/test/vp9_subtract_test.cc
@@ -9,11 +9,13 @@
  */
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -89,15 +91,19 @@
 }
 
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_c));
+                        ::testing::Values(vpx_subtract_block_c));
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_sse2));
+                        ::testing::Values(vpx_subtract_block_sse2));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_neon));
+                        ::testing::Values(vpx_subtract_block_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_msa));
 #endif
 
 }  // namespace vp9
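
The renamed entry points are the vpx_dsp residual (block subtract) kernels. A
small sketch of direct use on a 4x4 block, assuming the signature
(rows, cols, diff, diff_stride, src, src_stride, pred, pred_stride) declared
via ./vpx_dsp_rtcd.h:

  #include <stdint.h>
  #include "./vpx_dsp_rtcd.h"

  void SubtractSketch() {
    uint8_t src[16] = { 0 };
    uint8_t pred[16] = { 0 };
    int16_t diff[16];
    src[0] = 10;
    pred[0] = 3;
    // diff[i] = src[i] - pred[i] over the 4x4 block.
    vpx_subtract_block_c(4, 4, diff, 4, src, 4, pred, 4);
    // diff[0] is now 7; the remaining entries are 0.
  }
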
diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc
index d7fc4ee..233e1b1 100644
--- a/libvpx/test/vp9_thread_test.cc
+++ b/libvpx/test/vp9_thread_test.cc
@@ -18,33 +18,33 @@
 #if CONFIG_WEBM_IO
 #include "test/webm_video_source.h"
 #endif
-#include "vp9/common/vp9_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 namespace {
 
 using std::string;
 
-class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
+class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> {
  protected:
-  virtual ~VP9WorkerThreadTest() {}
+  virtual ~VPxWorkerThreadTest() {}
   virtual void SetUp() {
-    vp9_get_worker_interface()->init(&worker_);
+    vpx_get_worker_interface()->init(&worker_);
   }
 
   virtual void TearDown() {
-    vp9_get_worker_interface()->end(&worker_);
+    vpx_get_worker_interface()->end(&worker_);
   }
 
-  void Run(VP9Worker* worker) {
+  void Run(VPxWorker* worker) {
     const bool synchronous = GetParam();
     if (synchronous) {
-      vp9_get_worker_interface()->execute(worker);
+      vpx_get_worker_interface()->execute(worker);
     } else {
-      vp9_get_worker_interface()->launch(worker);
+      vpx_get_worker_interface()->launch(worker);
     }
   }
 
-  VP9Worker worker_;
+  VPxWorker worker_;
 };
 
 int ThreadHook(void* data, void* return_value) {
@@ -53,12 +53,12 @@
   return *reinterpret_cast<int*>(return_value);
 }
 
-TEST_P(VP9WorkerThreadTest, HookSuccess) {
+TEST_P(VPxWorkerThreadTest, HookSuccess) {
   // should be a no-op.
-  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+  EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
 
   for (int i = 0; i < 2; ++i) {
-    EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+    EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
 
     int hook_data = 0;
     int return_value = 1;  // return successfully from the hook
@@ -67,17 +67,17 @@
     worker_.data2 = &return_value;
 
     Run(&worker_);
-    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+    EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
     EXPECT_FALSE(worker_.had_error);
     EXPECT_EQ(5, hook_data);
 
     // should be a no-op.
-    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+    EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
   }
 }
 
-TEST_P(VP9WorkerThreadTest, HookFailure) {
-  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+TEST_P(VPxWorkerThreadTest, HookFailure) {
+  EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
 
   int hook_data = 0;
   int return_value = 0;  // return failure from the hook
@@ -86,29 +86,29 @@
   worker_.data2 = &return_value;
 
   Run(&worker_);
-  EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
+  EXPECT_FALSE(vpx_get_worker_interface()->sync(&worker_));
   EXPECT_EQ(1, worker_.had_error);
 
   // Ensure _reset() clears the error and _launch() can be called again.
   return_value = 1;
-  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
+  EXPECT_NE(vpx_get_worker_interface()->reset(&worker_), 0);
   EXPECT_FALSE(worker_.had_error);
-  vp9_get_worker_interface()->launch(&worker_);
-  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
+  vpx_get_worker_interface()->launch(&worker_);
+  EXPECT_NE(vpx_get_worker_interface()->sync(&worker_), 0);
   EXPECT_FALSE(worker_.had_error);
 }
 
-TEST_P(VP9WorkerThreadTest, EndWithoutSync) {
+TEST_P(VPxWorkerThreadTest, EndWithoutSync) {
   // Create a large number of threads to increase the chances of detecting a
   // race. Doing more work in the hook is no guarantee as any race would occur
   // post hook execution in the main thread loop driver.
   static const int kNumWorkers = 64;
-  VP9Worker workers[kNumWorkers];
+  VPxWorker workers[kNumWorkers];
   int hook_data[kNumWorkers];
   int return_value[kNumWorkers];
 
   for (int n = 0; n < kNumWorkers; ++n) {
-    vp9_get_worker_interface()->init(&workers[n]);
+    vpx_get_worker_interface()->init(&workers[n]);
     return_value[n] = 1;  // return successfully from the hook
     workers[n].hook = ThreadHook;
     workers[n].data1 = &hook_data[n];
@@ -117,7 +117,7 @@
 
   for (int i = 0; i < 2; ++i) {
     for (int n = 0; n < kNumWorkers; ++n) {
-      EXPECT_NE(vp9_get_worker_interface()->reset(&workers[n]), 0);
+      EXPECT_NE(vpx_get_worker_interface()->reset(&workers[n]), 0);
       hook_data[n] = 0;
     }
 
@@ -126,16 +126,16 @@
     }
 
     for (int n = kNumWorkers - 1; n >= 0; --n) {
-      vp9_get_worker_interface()->end(&workers[n]);
+      vpx_get_worker_interface()->end(&workers[n]);
     }
   }
 }
 
-TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
-  EXPECT_EQ(0, vp9_set_worker_interface(NULL));
-  EXPECT_TRUE(vp9_get_worker_interface() != NULL);
+TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
+  EXPECT_EQ(0, vpx_set_worker_interface(NULL));
+  EXPECT_TRUE(vpx_get_worker_interface() != NULL);
   for (int i = 0; i < 6; ++i) {
-    VP9WorkerInterface winterface = *vp9_get_worker_interface();
+    VPxWorkerInterface winterface = *vpx_get_worker_interface();
     switch (i) {
       default:
       case 0: winterface.init = NULL; break;
@@ -145,7 +145,7 @@
       case 4: winterface.execute = NULL; break;
       case 5: winterface.end = NULL; break;
     }
-    EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
+    EXPECT_EQ(0, vpx_set_worker_interface(&winterface));
   }
 }
 
@@ -163,7 +163,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = num_threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
@@ -202,21 +202,21 @@
 // hang.
 namespace impl {
 
-void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
-int Reset(VP9Worker *const /*worker*/) { return 1; }
-int Sync(VP9Worker *const worker) { return !worker->had_error; }
+void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VPxWorker *const /*worker*/) { return 1; }
+int Sync(VPxWorker *const worker) { return !worker->had_error; }
 
-void Execute(VP9Worker *const worker) {
-  worker->had_error |= worker->hook(worker->data1, worker->data2);
+void Execute(VPxWorker *const worker) {
+  worker->had_error |= !worker->hook(worker->data1, worker->data2);
 }
 
-void Launch(VP9Worker *const worker) { Execute(worker); }
-void End(VP9Worker *const /*worker*/) {}
+void Launch(VPxWorker *const worker) { Execute(worker); }
+void End(VPxWorker *const /*worker*/) {}
 
 }  // namespace impl
 
-TEST(VP9WorkerThreadTest, TestSerialInterface) {
-  static const VP9WorkerInterface serial_interface = {
+TEST(VPxWorkerThreadTest, TestSerialInterface) {
+  static const VPxWorkerInterface serial_interface = {
     impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
   };
   // TODO(jzern): Avoid using a file that will use the row-based thread
@@ -225,13 +225,13 @@
   // progress in the row above before proceeding.
   static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
   static const char filename[] = "vp90-2-03-size-226x226.webm";
-  VP9WorkerInterface default_interface = *vp9_get_worker_interface();
+  VPxWorkerInterface default_interface = *vpx_get_worker_interface();
 
-  EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
+  EXPECT_NE(vpx_set_worker_interface(&serial_interface), 0);
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 
   // Reset the interface.
-  EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
+  EXPECT_NE(vpx_set_worker_interface(&default_interface), 0);
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
 
@@ -309,6 +309,6 @@
 }
 #endif  // CONFIG_WEBM_IO
 
-INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool());
+INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
 
 }  // namespace
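
The worker API exercised above is small enough to sketch in isolation. A
minimal standalone example, using only the interface members shown in this
test (init/reset/launch/sync/end and the hook/data1/data2/had_error fields):

  #include <stddef.h>
  #include "vpx_util/vpx_thread.h"

  int AddOneHook(void *data, void * /*unused*/) {
    ++*reinterpret_cast<int *>(data);
    return 1;  // non-zero indicates success
  }

  bool RunOneWorker() {
    const VPxWorkerInterface *const iface = vpx_get_worker_interface();
    VPxWorker worker;
    int counter = 0;
    iface->init(&worker);
    if (!iface->reset(&worker)) return false;  // spawns the thread
    worker.hook = AddOneHook;
    worker.data1 = &counter;
    worker.data2 = NULL;
    iface->launch(&worker);               // run the hook asynchronously
    const int ok = iface->sync(&worker);  // join; returns 0 if the hook failed
    iface->end(&worker);
    return ok != 0 && counter == 1;
  }
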
diff --git a/libvpx/test/vpx_scale_test.cc b/libvpx/test/vpx_scale_test.cc
index b3302d9..ef716fc 100644
--- a/libvpx/test/vpx_scale_test.cc
+++ b/libvpx/test/vpx_scale_test.cc
@@ -10,11 +10,10 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
 
@@ -33,10 +32,10 @@
   void ResetImage(int width, int height) {
     width_ = width;
     height_ = height;
-    vpx_memset(&img_, 0, sizeof(img_));
+    memset(&img_, 0, sizeof(img_));
     ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
                                              VP8BORDERINPIXELS));
-    vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
+    memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
     FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
               img_.y_stride);
     FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
@@ -44,15 +43,15 @@
     FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
               img_.uv_stride);
 
-    vpx_memset(&ref_img_, 0, sizeof(ref_img_));
+    memset(&ref_img_, 0, sizeof(ref_img_));
     ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
                                              VP8BORDERINPIXELS));
-    vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
+    memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
 
-    vpx_memset(&cpy_img_, 0, sizeof(cpy_img_));
+    memset(&cpy_img_, 0, sizeof(cpy_img_));
     ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
                                              VP8BORDERINPIXELS));
-    vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+    memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
     ReferenceCopyFrame();
   }
 
@@ -87,8 +86,8 @@
 
     // Fill the border pixels from the nearest image pixel.
     for (int y = 0; y < crop_height; ++y) {
-      vpx_memset(left, left[padding], padding);
-      vpx_memset(right, right[-1], right_extend);
+      memset(left, left[padding], padding);
+      memset(right, right[-1], right_extend);
       left += stride;
       right += stride;
     }
@@ -101,13 +100,13 @@
 
     // The first row was already extended to the left and right. Copy it up.
     for (int y = 0; y < padding; ++y) {
-      vpx_memcpy(top, left, extend_width);
+      memcpy(top, left, extend_width);
       top += stride;
     }
 
     uint8_t *bottom = left + (crop_height * stride);
     for (int y = 0; y <  bottom_extend; ++y) {
-      vpx_memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+      memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
       bottom += stride;
     }
   }
diff --git a/libvpx/test/vpxdec.sh b/libvpx/test/vpxdec.sh
index 836b13c..de51c80 100755
--- a/libvpx/test/vpxdec.sh
+++ b/libvpx/test/vpxdec.sh
@@ -16,15 +16,16 @@
 
 # Environment check: Make sure input is available.
 vpxdec_verify_environment() {
-  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ]; then
-    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
-}
-
-# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
-vpxdec_available() {
-  [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+  if [ -z "$(vpx_tool_path vpxdec)" ]; then
+    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
 }
 
 # Wrapper function for running vpxdec with pipe input. Requires that
@@ -32,8 +33,8 @@
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxdec.
 vpxdec_pipe() {
-  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
-  local input="$1"
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
   shift
   cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
 }
@@ -42,22 +43,20 @@
 # the directory containing vpxdec. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxdec.
 vpxdec() {
-  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
-  local input="${1}"
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
   shift
   eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
 }
 
 vpxdec_can_decode_vp8() {
-  if [ "$(vpxdec_available)" = "yes" ] && \
-     [ "$(vp8_decode_available)" = "yes" ]; then
+  if [ "$(vp8_decode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
 vpxdec_can_decode_vp9() {
-  if [ "$(vpxdec_available)" = "yes" ] && \
-     [ "$(vp9_decode_available)" = "yes" ]; then
+  if [ "$(vp9_decode_available)" = "yes" ]; then
     echo yes
   fi
 }
@@ -81,8 +80,37 @@
   fi
 }
 
+vpxdec_vp9_webm_frame_parallel() {
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    for threads in 2 3 4 5 6 7 8; do
+      vpxdec "${VP9_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \
+        --frame-parallel
+    done
+  fi
+}
+
+vpxdec_vp9_webm_less_than_50_frames() {
+  # Ensure that reaching EOF in webm_guess_framerate() does not result in
+  # invalid frames in actual webm_read_frame() calls.
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly decoder="$(vpx_tool_path vpxdec)"
+    local readonly expected=10
+    local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+      "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
+      | awk '/^[0-9]+ decoded frames/ { print $1 }')
+    if [ "$num_frames" -ne "$expected" ]; then
+      elog "Output frames ($num_frames) != expected ($expected)"
+      return 1
+    fi
+  fi
+}
+
 vpxdec_tests="vpxdec_vp8_ivf
               vpxdec_vp8_ivf_pipe_input
-              vpxdec_vp9_webm"
+              vpxdec_vp9_webm
+              vpxdec_vp9_webm_frame_parallel
+              vpxdec_vp9_webm_less_than_50_frames"
 
 run_tests vpxdec_verify_environment "${vpxdec_tests}"
diff --git a/libvpx/test/vpxenc.sh b/libvpx/test/vpxenc.sh
index 6e9ad35..e899499 100755
--- a/libvpx/test/vpxenc.sh
+++ b/libvpx/test/vpxenc.sh
@@ -20,28 +20,74 @@
 # Environment check: Make sure input is available.
 vpxenc_verify_environment() {
   if [ ! -e "${YUV_RAW_INPUT}" ]; then
-    echo "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then
+      elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in"
+      elog "LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+  fi
+  if [ -z "$(vpx_tool_path vpxenc)" ]; then
+    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
     return 1
   fi
 }
 
 vpxenc_can_encode_vp8() {
-  if [ "$(vpxenc_available)" = "yes" ] && \
-     [ "$(vp8_encode_available)" = "yes" ]; then
+  if [ "$(vp8_encode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
 vpxenc_can_encode_vp9() {
-  if [ "$(vpxenc_available)" = "yes" ] && \
-     [ "$(vp9_encode_available)" = "yes" ]; then
+  if [ "$(vp9_encode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
-# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
-vpxenc_available() {
-  [ -n "$(vpx_tool_available vpxenc)" ] && echo yes
+# Echo vpxenc command line parameters allowing use of
+# hantro_collage_w352h288.yuv as input.
+yuv_input_hantro_collage() {
+  echo ""${YUV_RAW_INPUT}"
+       --width="${YUV_RAW_INPUT_WIDTH}"
+       --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+y4m_input_non_square_par() {
+  echo ""${Y4M_NOSQ_PAR_INPUT}""
+}
+
+y4m_input_720p() {
+  echo ""${Y4M_720P_INPUT}""
+}
+
+# Echo default vpxenc real time encoding params. $1 is the codec, which defaults
+# to vp8 if unspecified.
+vpxenc_rt_params() {
+  local readonly codec="${1:-vp8}"
+  echo "--codec=${codec}
+    --buf-initial-sz=500
+    --buf-optimal-sz=600
+    --buf-sz=1000
+    --cpu-used=-6
+    --end-usage=cbr
+    --error-resilient=1
+    --kf-max-dist=90000
+    --lag-in-frames=0
+    --max-intra-rate=300
+    --max-q=56
+    --min-q=2
+    --noise-sensitivity=0
+    --overshoot-pct=50
+    --passes=1
+    --profile=0
+    --resize-allowed=0
+    --rt
+    --static-thresh=0
+    --undershoot-pct=50"
 }
 
 # Wrapper function for running vpxenc with pipe input. Requires that
@@ -49,51 +95,34 @@
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxenc.
 vpxenc_pipe() {
-  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
   local readonly input="$1"
   shift
-  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - "$@" ${devnull}
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \
+    --test-decode=fatal \
+    "$@" ${devnull}
 }
 
 # Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
 # the directory containing vpxenc. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxenc.
 vpxenc() {
-  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
-  local readonly input="${1}"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly input="$1"
   shift
-  eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" "$@" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
+    --test-decode=fatal \
+    "$@" ${devnull}
 }
 
 vpxenc_vp8_ivf() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
-
-    if [ ! -e "${output}" ]; then
-      elog "Output file does not exist."
-      return 1
-    fi
-  fi
-}
-
-vpxenc_vp8_ivf_piped_input() {
-  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
-    cat "${YUV_RAW_INPUT}" \
-      | vpxenc --codec=vp8 \
-        --width="${YUV_RAW_INPUT_WIDTH}" \
-        --height="${YUV_RAW_INPUT_HEIGHT}" \
-        --limit="${TEST_FRAMES}" \
-        --ivf \
-        --output="${output}" \
-        -
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -106,12 +135,78 @@
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp8) \
+      --output="${output}"
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --passes=2
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --auto-alt-ref=1 \
+      --passes=2
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_ivf_piped_input() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    vpxenc_pipe $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -123,13 +218,11 @@
 vpxenc_vp9_ivf() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -142,12 +235,99 @@
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp9) \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --output="${output}"
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --frame-parallel=1 \
+          --output="${output}"
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
+vpxenc_vp9_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --passes=2
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -159,15 +339,12 @@
 vpxenc_vp9_ivf_lossless() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
-      --lossless=1 \
-      --test-decode=fatal \
-      "${YUV_RAW_INPUT}"
+      --lossless=1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -179,16 +356,51 @@
 vpxenc_vp9_ivf_minq0_maxq0() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
       --min-q=0 \
-      --max-q=0 \
-      --test-decode=fatal \
-      "${YUV_RAW_INPUT}"
+      --max-q=0
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --passes=2 \
+      --auto-alt-ref=1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+# TODO(fgalligan): Test that DisplayWidth is different than video width.
+vpxenc_vp9_webm_non_square_par() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+    vpxenc $(y4m_input_non_square_par) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -199,10 +411,19 @@
 
 vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp8_webm
+              vpxenc_vp8_webm_rt
+              vpxenc_vp8_webm_2pass
+              vpxenc_vp8_webm_lag10_frames20
               vpxenc_vp8_ivf_piped_input
               vpxenc_vp9_ivf
               vpxenc_vp9_webm
+              vpxenc_vp9_webm_rt
+              vpxenc_vp9_webm_rt_multithread_tiled
+              vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
+              vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
-              vpxenc_vp9_ivf_minq0_maxq0"
+              vpxenc_vp9_ivf_minq0_maxq0
+              vpxenc_vp9_webm_lag10_frames20
+              vpxenc_vp9_webm_non_square_par"
 
 run_tests vpxenc_verify_environment "${vpxenc_tests}"
diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h
index 11d3d23..650bc52 100644
--- a/libvpx/test/webm_video_source.h
+++ b/libvpx/test/webm_video_source.h
@@ -69,6 +69,18 @@
     }
   }
 
+  void SeekToNextKeyFrame() {
+    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    do {
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      ASSERT_GE(status, 0) << "webm_read_frame failed";
+      ++frame_;
+      if (status == 1) {
+        end_of_file_ = true;
+      }
+    } while (!webm_ctx_->is_key_frame && !end_of_file_);
+  }
+
   virtual const uint8_t *cxdata() const {
     return end_of_file_ ? NULL : buf_;
   }
diff --git a/libvpx/test/y4m_test.cc b/libvpx/test/y4m_test.cc
index d4a2ede..a555329 100644
--- a/libvpx/test/y4m_test.cc
+++ b/libvpx/test/y4m_test.cc
@@ -9,12 +9,14 @@
  */
 
 #include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./y4menc.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./y4menc.h"
 
 namespace {
 
@@ -57,7 +59,7 @@
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
     const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
                    img->y_chroma_shift : img->d_h);
     const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
@@ -141,11 +143,11 @@
   Y4mVideoWriteTest() {}
 
   virtual ~Y4mVideoWriteTest() {
-    CloseSource();
     delete tmpfile_;
+    input_file_ = NULL;
   }
 
-  virtual void ReplaceInputFile(FILE *input_file) {
+  void ReplaceInputFile(FILE *input_file) {
     CloseSource();
     frame_ = 0;
     input_file_ = input_file;
diff --git a/libvpx/test/yuv_video_source.h b/libvpx/test/yuv_video_source.h
new file mode 100644
index 0000000..3c852b2
--- /dev/null
+++ b/libvpx/test/yuv_video_source.h
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_YUV_VIDEO_SOURCE_H_
+#define TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "vpx/vpx_image.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats with various color samplings and bit depths so that we can
+// do actual file encodes.
+class YUVVideoSource : public VideoSource {
+ public:
+  YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
+                 unsigned int width, unsigned int height,
+                 int rate_numerator, int rate_denominator,
+                 unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(NULL),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        width_(0),
+        height_(0),
+        format_(VPX_IMG_FMT_NONE),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img_.
+    SetSize(width, height, format);
+  }
+
+  virtual ~YUVVideoSource() {
+    vpx_img_free(img_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+                                     << file_name_;
+    if (start_)
+      fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       vpx_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      vpx_img_free(img_);
+      img_ = vpx_img_alloc(NULL, format, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      switch (format) {
+        case VPX_IMG_FMT_I420:
+          raw_size_ = width * height * 3 / 2;
+          break;
+        case VPX_IMG_FMT_I422:
+          raw_size_ = width * height * 2;
+          break;
+        case VPX_IMG_FMT_I440:
+          raw_size_ = width * height * 2;
+          break;
+        case VPX_IMG_FMT_I444:
+          raw_size_ = width * height * 3;
+          break;
+        case VPX_IMG_FMT_I42016:
+          raw_size_ = width * height * 3;
+          break;
+        case VPX_IMG_FMT_I42216:
+          raw_size_ = width * height * 4;
+          break;
+        case VPX_IMG_FMT_I44016:
+          raw_size_ = width * height * 4;
+          break;
+        case VPX_IMG_FMT_I44416:
+          raw_size_ = width * height * 6;
+          break;
+        default:
+          ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  vpx_image_t *img_;
+  size_t raw_size_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  vpx_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_YUV_VIDEO_SOURCE_H_
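
A short sketch of constructing the new source in an encoder test. The file
name, dimensions, and frame counts below are placeholders, not values taken
from the test suite, and the helper assumes it runs inside a gtest test body
(Begin() uses ASSERT macros):

  void EncodeFromRawYuvSketch() {
    // 4:2:0 input, 352x288, 30/1 timebase, start at frame 0, 30 frames total.
    libvpx_test::YUVVideoSource video("input_352x288_30fps.yuv",
                                      VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, 30);
    video.Begin();
    for (; video.img() != NULL; video.Next()) {
      // Hand video.img() and video.pts() to the encoder under test here.
    }
  }
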
diff --git a/libvpx/third_party/libwebm/Android.mk b/libvpx/third_party/libwebm/Android.mk
index 13868b6..be9d77d 100644
--- a/libvpx/third_party/libwebm/Android.mk
+++ b/libvpx/third_party/libwebm/Android.mk
@@ -1,11 +1,10 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
+LOCAL_PATH:= $(call my-dir)
 
-LOCAL_CPP_EXTENSION := .cpp
-LOCAL_SRC_FILES := mkvmuxer.cpp \
-                   mkvmuxerutil.cpp \
-                   mkvparser.cpp \
-                   mkvreader.cpp \
-                   mkvwriter.cpp
-LOCAL_MODULE := libwebm
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+LOCAL_SRC_FILES:= mkvparser.cpp \
+                  mkvreader.cpp \
+                  mkvmuxer.cpp \
+                  mkvmuxerutil.cpp \
+                  mkvwriter.cpp
 include $(BUILD_STATIC_LIBRARY)
diff --git a/libvpx/third_party/libwebm/PATENTS.TXT b/libvpx/third_party/libwebm/PATENTS.TXT
index 79d17d7..caedf60 100644
--- a/libvpx/third_party/libwebm/PATENTS.TXT
+++ b/libvpx/third_party/libwebm/PATENTS.TXT
@@ -17,7 +17,7 @@
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx
index 93814b7..91875e1 100644
--- a/libvpx/third_party/libwebm/README.libvpx
+++ b/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 249629d46c6e9391f25a90cff6d19075f47474cb
+Version: 2dec09426ab62b794464cc9971bd135b4d313e65
 License: BSD
 License File: LICENSE.txt
 
diff --git a/libvpx/third_party/libwebm/mkvmuxer.cpp b/libvpx/third_party/libwebm/mkvmuxer.cpp
index 45167ea..9be3119 100644
--- a/libvpx/third_party/libwebm/mkvmuxer.cpp
+++ b/libvpx/third_party/libwebm/mkvmuxer.cpp
@@ -65,14 +65,14 @@
 
 IMkvWriter::~IMkvWriter() {}
 
-bool WriteEbmlHeader(IMkvWriter* writer) {
+bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version) {
   // Level 0
   uint64 size = EbmlElementSize(kMkvEBMLVersion, 1ULL);
   size += EbmlElementSize(kMkvEBMLReadVersion, 1ULL);
   size += EbmlElementSize(kMkvEBMLMaxIDLength, 4ULL);
   size += EbmlElementSize(kMkvEBMLMaxSizeLength, 8ULL);
   size += EbmlElementSize(kMkvDocType, "webm");
-  size += EbmlElementSize(kMkvDocTypeVersion, 2ULL);
+  size += EbmlElementSize(kMkvDocTypeVersion, doc_type_version);
   size += EbmlElementSize(kMkvDocTypeReadVersion, 2ULL);
 
   if (!WriteEbmlMasterElement(writer, kMkvEBML, size))
@@ -87,7 +87,7 @@
     return false;
   if (!WriteEbmlElement(writer, kMkvDocType, "webm"))
     return false;
-  if (!WriteEbmlElement(writer, kMkvDocTypeVersion, 2ULL))
+  if (!WriteEbmlElement(writer, kMkvDocTypeVersion, doc_type_version))
     return false;
   if (!WriteEbmlElement(writer, kMkvDocTypeReadVersion, 2ULL))
     return false;
@@ -95,6 +95,10 @@
   return true;
 }
 
+bool WriteEbmlHeader(IMkvWriter* writer) {
+  return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion);
+}
+
 bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
                  mkvmuxer::int64 start, int64 size) {
   // TODO(vigneshv): Check if this is a reasonable value.
@@ -127,13 +131,40 @@
       length_(0),
       track_number_(0),
       timestamp_(0),
-      discard_padding_(0) {}
+      discard_padding_(0),
+      reference_block_timestamp_(0),
+      reference_block_timestamp_set_(false) {}
 
 Frame::~Frame() {
   delete[] frame_;
   delete[] additional_;
 }
 
+bool Frame::CopyFrom(const Frame& frame) {
+  delete[] frame_;
+  frame_ = NULL;
+  length_ = 0;
+  if (frame.length() > 0 && frame.frame() != NULL &&
+      !Init(frame.frame(), frame.length())) {
+    return false;
+  }
+  add_id_ = 0;
+  delete[] additional_;
+  additional_ = NULL;
+  additional_length_ = 0;
+  if (frame.additional_length() > 0 && frame.additional() != NULL &&
+      !AddAdditionalData(frame.additional(), frame.additional_length(),
+                         frame.add_id())) {
+    return false;
+  }
+  duration_ = frame.duration();
+  is_key_ = frame.is_key();
+  track_number_ = frame.track_number();
+  timestamp_ = frame.timestamp();
+  discard_padding_ = frame.discard_padding();
+  return true;
+}
+
 bool Frame::Init(const uint8* frame, uint64 length) {
   uint8* const data =
       new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
@@ -164,6 +195,32 @@
   return true;
 }
 
+bool Frame::IsValid() const {
+  if (length_ == 0 || !frame_) {
+    return false;
+  }
+  if ((additional_length_ != 0 && !additional_) ||
+      (additional_ != NULL && additional_length_ == 0)) {
+    return false;
+  }
+  if (track_number_ == 0 || track_number_ > kMaxTrackNumber) {
+    return false;
+  }
+  if (!CanBeSimpleBlock() && !is_key_ && !reference_block_timestamp_set_) {
+    return false;
+  }
+  return true;
+}
+
+bool Frame::CanBeSimpleBlock() const {
+  return additional_ == NULL && discard_padding_ == 0 && duration_ == 0;
+}
+
+void Frame::set_reference_block_timestamp(int64 reference_block_timestamp) {
+  reference_block_timestamp_ = reference_block_timestamp;
+  reference_block_timestamp_set_ = true;
+}
+
 ///////////////////////////////////////////////////////////////
 //
 // CuePoint Class
@@ -271,7 +328,7 @@
       return false;
 
     CuePoint** const cues =
-        new (std::nothrow) CuePoint* [new_capacity];  // NOLINT
+        new (std::nothrow) CuePoint*[new_capacity];  // NOLINT
     if (!cues)
       return false;
 
@@ -532,7 +589,7 @@
   const uint32 count = content_encoding_entries_size_ + 1;
 
   ContentEncoding** const content_encoding_entries =
-      new (std::nothrow) ContentEncoding* [count];  // NOLINT
+      new (std::nothrow) ContentEncoding*[count];  // NOLINT
   if (!content_encoding_entries)
     return false;
 
@@ -612,6 +669,10 @@
   if (!writer)
     return false;
 
+  // Mandatory elements without a default value.
+  if (!type_ || !codec_id_)
+    return false;
+
   // |size| may be bigger than what is written out in this function because
   // derived classes may write out more data in the Track element.
   const uint64 payload_size = PayloadSize();
@@ -619,10 +680,6 @@
   if (!WriteEbmlMasterElement(writer, kMkvTrackEntry, payload_size))
     return false;
 
-  // |type_| has to be specified before the Track can be written.
-  if (!type_)
-    return false;
-
   uint64 size = EbmlElementSize(kMkvTrackNumber, number_);
   size += EbmlElementSize(kMkvTrackUID, uid_);
   size += EbmlElementSize(kMkvTrackType, type_);
@@ -793,6 +850,10 @@
     : Track(seed),
       display_height_(0),
       display_width_(0),
+      crop_left_(0),
+      crop_right_(0),
+      crop_top_(0),
+      crop_bottom_(0),
       frame_rate_(0.0),
       height_(0),
       stereo_mode_(0),
@@ -846,27 +907,50 @@
     return false;
   if (!WriteEbmlElement(writer, kMkvPixelHeight, height_))
     return false;
-  if (display_width_ > 0)
+  if (display_width_ > 0) {
     if (!WriteEbmlElement(writer, kMkvDisplayWidth, display_width_))
       return false;
-  if (display_height_ > 0)
+  }
+  if (display_height_ > 0) {
     if (!WriteEbmlElement(writer, kMkvDisplayHeight, display_height_))
       return false;
-  if (stereo_mode_ > kMono)
+  }
+  if (crop_left_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropLeft, crop_left_))
+      return false;
+  }
+  if (crop_right_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropRight, crop_right_))
+      return false;
+  }
+  if (crop_top_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropTop, crop_top_))
+      return false;
+  }
+  if (crop_bottom_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropBottom, crop_bottom_))
+      return false;
+  }
+  if (stereo_mode_ > kMono) {
     if (!WriteEbmlElement(writer, kMkvStereoMode, stereo_mode_))
       return false;
-  if (alpha_mode_ > kNoAlpha)
+  }
+  if (alpha_mode_ > kNoAlpha) {
     if (!WriteEbmlElement(writer, kMkvAlphaMode, alpha_mode_))
       return false;
-  if (frame_rate_ > 0.0)
+  }
+  if (frame_rate_ > 0.0) {
     if (!WriteEbmlElement(writer, kMkvFrameRate,
-                          static_cast<float>(frame_rate_)))
+                          static_cast<float>(frame_rate_))) {
       return false;
+    }
+  }
 
   const int64 stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64>(size)) {
     return false;
+  }
 
   return true;
 }
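The new PixelCrop* elements are written only when non-zero. A hedged sketch of driving
them through the segment API (track dimensions and crop values are arbitrary; it assumes
the existing Segment::AddVideoTrack/GetTrackByNumber accessors):

  #include "mkvmuxer.hpp"

  // Sketch only: crops 8 pixels from each edge of a 1288x728 coded frame.
  bool AddCroppedVideoTrack(mkvmuxer::IMkvWriter* writer) {
    mkvmuxer::Segment segment;
    if (!segment.Init(writer))
      return false;
    const mkvmuxer::uint64 track_num = segment.AddVideoTrack(1288, 728, 0);
    if (track_num == 0)
      return false;
    mkvmuxer::VideoTrack* const video = static_cast<mkvmuxer::VideoTrack*>(
        segment.GetTrackByNumber(track_num));
    if (video == NULL)
      return false;
    video->set_crop_left(8);
    video->set_crop_right(8);
    video->set_crop_top(8);
    video->set_crop_bottom(8);
    return true;
  }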
@@ -878,6 +962,14 @@
     size += EbmlElementSize(kMkvDisplayWidth, display_width_);
   if (display_height_ > 0)
     size += EbmlElementSize(kMkvDisplayHeight, display_height_);
+  if (crop_left_ > 0)
+    size += EbmlElementSize(kMkvPixelCropLeft, crop_left_);
+  if (crop_right_ > 0)
+    size += EbmlElementSize(kMkvPixelCropRight, crop_right_);
+  if (crop_top_ > 0)
+    size += EbmlElementSize(kMkvPixelCropTop, crop_top_);
+  if (crop_bottom_ > 0)
+    size += EbmlElementSize(kMkvPixelCropBottom, crop_bottom_);
   if (stereo_mode_ > kMono)
     size += EbmlElementSize(kMkvStereoMode, stereo_mode_);
   if (alpha_mode_ > kNoAlpha)
@@ -953,6 +1045,7 @@
 const char Tracks::kVorbisCodecId[] = "A_VORBIS";
 const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
+const char Tracks::kVp10CodecId[] = "V_VP10";
 
 Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0) {}
 
@@ -990,7 +1083,7 @@
 
   const uint32 count = track_entries_size_ + 1;
 
-  Track** const track_entries = new (std::nothrow) Track* [count];  // NOLINT
+  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
   if (!track_entries)
     return false;
 
@@ -1145,6 +1238,8 @@
 
 void Chapter::Init(unsigned int* seed) {
   id_ = NULL;
+  start_timecode_ = 0;
+  end_timecode_ = 0;
   displays_ = NULL;
   displays_size_ = 0;
   displays_count_ = 0;
@@ -1420,11 +1515,242 @@
   return edition_size;
 }
 
+// Tag Class
+
+bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) {
+  if (!ExpandSimpleTagsArray())
+    return false;
+
+  SimpleTag& st = simple_tags_[simple_tags_count_++];
+  st.Init();
+
+  if (!st.set_tag_name(tag_name))
+    return false;
+
+  if (!st.set_tag_string(tag_string))
+    return false;
+
+  return true;
+}
+
+Tag::Tag() {
+  simple_tags_ = NULL;
+  simple_tags_size_ = 0;
+  simple_tags_count_ = 0;
+}
+
+Tag::~Tag() {}
+
+void Tag::ShallowCopy(Tag* dst) const {
+  dst->simple_tags_ = simple_tags_;
+  dst->simple_tags_size_ = simple_tags_size_;
+  dst->simple_tags_count_ = simple_tags_count_;
+}
+
+void Tag::Clear() {
+  while (simple_tags_count_ > 0) {
+    SimpleTag& st = simple_tags_[--simple_tags_count_];
+    st.Clear();
+  }
+
+  delete[] simple_tags_;
+  simple_tags_ = NULL;
+
+  simple_tags_size_ = 0;
+}
+
+bool Tag::ExpandSimpleTagsArray() {
+  if (simple_tags_size_ > simple_tags_count_)
+    return true;  // nothing to do yet
+
+  const int size = (simple_tags_size_ == 0) ? 1 : 2 * simple_tags_size_;
+
+  SimpleTag* const simple_tags = new (std::nothrow) SimpleTag[size];  // NOLINT
+  if (simple_tags == NULL)
+    return false;
+
+  for (int idx = 0; idx < simple_tags_count_; ++idx) {
+    simple_tags[idx] = simple_tags_[idx];  // shallow copy
+  }
+
+  delete[] simple_tags_;
+
+  simple_tags_ = simple_tags;
+  simple_tags_size_ = size;
+
+  return true;
+}
+
+uint64 Tag::Write(IMkvWriter* writer) const {
+  uint64 payload_size = 0;
+
+  for (int idx = 0; idx < simple_tags_count_; ++idx) {
+    const SimpleTag& st = simple_tags_[idx];
+    payload_size += st.Write(NULL);
+  }
+
+  const uint64 tag_size =
+      EbmlMasterElementSize(kMkvTag, payload_size) + payload_size;
+
+  if (writer == NULL)
+    return tag_size;
+
+  const int64 start = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvTag, payload_size))
+    return 0;
+
+  for (int idx = 0; idx < simple_tags_count_; ++idx) {
+    const SimpleTag& st = simple_tags_[idx];
+
+    if (!st.Write(writer))
+      return 0;
+  }
+
+  const int64 stop = writer->Position();
+
+  if (stop >= start && uint64(stop - start) != tag_size)
+    return 0;
+
+  return tag_size;
+}
+
+// Tag::SimpleTag
+
+void Tag::SimpleTag::Init() {
+  tag_name_ = NULL;
+  tag_string_ = NULL;
+}
+
+void Tag::SimpleTag::Clear() {
+  StrCpy(NULL, &tag_name_);
+  StrCpy(NULL, &tag_string_);
+}
+
+bool Tag::SimpleTag::set_tag_name(const char* tag_name) {
+  return StrCpy(tag_name, &tag_name_);
+}
+
+bool Tag::SimpleTag::set_tag_string(const char* tag_string) {
+  return StrCpy(tag_string, &tag_string_);
+}
+
+uint64 Tag::SimpleTag::Write(IMkvWriter* writer) const {
+  uint64 payload_size = EbmlElementSize(kMkvTagName, tag_name_);
+
+  payload_size += EbmlElementSize(kMkvTagString, tag_string_);
+
+  const uint64 simple_tag_size =
+      EbmlMasterElementSize(kMkvSimpleTag, payload_size) + payload_size;
+
+  if (writer == NULL)
+    return simple_tag_size;
+
+  const int64 start = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvSimpleTag, payload_size))
+    return 0;
+
+  if (!WriteEbmlElement(writer, kMkvTagName, tag_name_))
+    return 0;
+
+  if (!WriteEbmlElement(writer, kMkvTagString, tag_string_))
+    return 0;
+
+  const int64 stop = writer->Position();
+
+  if (stop >= start && uint64(stop - start) != simple_tag_size)
+    return 0;
+
+  return simple_tag_size;
+}
+
+// Tags Class
+
+Tags::Tags() : tags_size_(0), tags_count_(0), tags_(NULL) {}
+
+Tags::~Tags() {
+  while (tags_count_ > 0) {
+    Tag& tag = tags_[--tags_count_];
+    tag.Clear();
+  }
+
+  delete[] tags_;
+  tags_ = NULL;
+}
+
+int Tags::Count() const { return tags_count_; }
+
+Tag* Tags::AddTag() {
+  if (!ExpandTagsArray())
+    return NULL;
+
+  Tag& tag = tags_[tags_count_++];
+
+  return &tag;
+}
+
+bool Tags::Write(IMkvWriter* writer) const {
+  if (writer == NULL)
+    return false;
+
+  uint64 payload_size = 0;
+
+  for (int idx = 0; idx < tags_count_; ++idx) {
+    const Tag& tag = tags_[idx];
+    payload_size += tag.Write(NULL);
+  }
+
+  if (!WriteEbmlMasterElement(writer, kMkvTags, payload_size))
+    return false;
+
+  const int64 start = writer->Position();
+
+  for (int idx = 0; idx < tags_count_; ++idx) {
+    const Tag& tag = tags_[idx];
+
+    const uint64 tag_size = tag.Write(writer);
+    if (tag_size == 0)  // error
+      return false;
+  }
+
+  const int64 stop = writer->Position();
+
+  if (stop >= start && uint64(stop - start) != payload_size)
+    return false;
+
+  return true;
+}
+
+bool Tags::ExpandTagsArray() {
+  if (tags_size_ > tags_count_)
+    return true;  // nothing to do yet
+
+  const int size = (tags_size_ == 0) ? 1 : 2 * tags_size_;
+
+  Tag* const tags = new (std::nothrow) Tag[size];  // NOLINT
+  if (tags == NULL)
+    return false;
+
+  for (int idx = 0; idx < tags_count_; ++idx) {
+    const Tag& src = tags_[idx];
+    Tag* const dst = tags + idx;
+    src.ShallowCopy(dst);
+  }
+
+  delete[] tags_;
+
+  tags_ = tags;
+  tags_size_ = size;
+
+  return true;
+}
+
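The Tag/Tags classes above are driven from Segment::AddTag (added later in this patch);
Tags::Write is emitted from Segment::WriteSegmentHeader once at least one tag exists.
A small usage sketch (the tag name and value are made up):

  #include "mkvmuxer.hpp"

  // Attaches a single global SimpleTag to the segment. Sketch only.
  bool AddEncoderTag(mkvmuxer::Segment* segment) {
    mkvmuxer::Tag* const tag = segment->AddTag();
    if (tag == NULL)
      return false;
    return tag->add_simple_tag("ENCODER", "libvpx example");
  }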
 ///////////////////////////////////////////////////////////////
 //
 // Cluster class
 
-Cluster::Cluster(uint64 timecode, int64 cues_pos)
+Cluster::Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale)
     : blocks_added_(0),
       finalized_(false),
       header_written_(false),
@@ -1432,6 +1758,7 @@
       position_for_cues_(cues_pos),
       size_position_(-1),
       timecode_(timecode),
+      timecode_scale_(timecode_scale),
       writer_(NULL) {}
 
 Cluster::~Cluster() {}
@@ -1444,36 +1771,62 @@
   return true;
 }
 
-bool Cluster::AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+bool Cluster::AddFrame(const Frame* const frame) { return DoWriteFrame(frame); }
+
+bool Cluster::AddFrame(const uint8* data, uint64 length, uint64 track_number,
                        uint64 abs_timecode, bool is_key) {
-  return DoWriteBlock(frame, length, track_number, abs_timecode, is_key ? 1 : 0,
-                      &WriteSimpleBlock);
+  Frame frame;
+  if (!frame.Init(data, length))
+    return false;
+  frame.set_track_number(track_number);
+  frame.set_timestamp(abs_timecode);
+  frame.set_is_key(is_key);
+  return DoWriteFrame(&frame);
 }
 
-bool Cluster::AddFrameWithAdditional(const uint8* frame, uint64 length,
+bool Cluster::AddFrameWithAdditional(const uint8* data, uint64 length,
                                      const uint8* additional,
                                      uint64 additional_length, uint64 add_id,
                                      uint64 track_number, uint64 abs_timecode,
                                      bool is_key) {
-  return DoWriteBlockWithAdditional(
-      frame, length, additional, additional_length, add_id, track_number,
-      abs_timecode, is_key ? 1 : 0, &WriteBlockWithAdditional);
+  if (!additional || additional_length == 0) {
+    return false;
+  }
+  Frame frame;
+  if (!frame.Init(data, length) ||
+      !frame.AddAdditionalData(additional, additional_length, add_id)) {
+    return false;
+  }
+  frame.set_track_number(track_number);
+  frame.set_timestamp(abs_timecode);
+  frame.set_is_key(is_key);
+  return DoWriteFrame(&frame);
 }
 
-bool Cluster::AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+bool Cluster::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
                                          int64 discard_padding,
                                          uint64 track_number,
                                          uint64 abs_timecode, bool is_key) {
-  return DoWriteBlockWithDiscardPadding(
-      frame, length, discard_padding, track_number, abs_timecode,
-      is_key ? 1 : 0, &WriteBlockWithDiscardPadding);
+  Frame frame;
+  if (!frame.Init(data, length))
+    return false;
+  frame.set_discard_padding(discard_padding);
+  frame.set_track_number(track_number);
+  frame.set_timestamp(abs_timecode);
+  frame.set_is_key(is_key);
+  return DoWriteFrame(&frame);
 }
 
-bool Cluster::AddMetadata(const uint8* frame, uint64 length,
-                          uint64 track_number, uint64 abs_timecode,
-                          uint64 duration_timecode) {
-  return DoWriteBlock(frame, length, track_number, abs_timecode,
-                      duration_timecode, &WriteMetadataBlock);
+bool Cluster::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                          uint64 abs_timecode, uint64 duration_timecode) {
+  Frame frame;
+  if (!frame.Init(data, length))
+    return false;
+  frame.set_track_number(track_number);
+  frame.set_timestamp(abs_timecode);
+  frame.set_duration(duration_timecode);
+  frame.set_is_key(true);  // All metadata blocks are keyframes.
+  return DoWriteFrame(&frame);
 }
 
 void Cluster::AddPayloadSize(uint64 size) { payload_size_ += size; }
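The legacy Cluster::AddFrame* overloads above now just populate a Frame and defer to
DoWriteFrame. A caller using a Cluster directly can follow the same pattern; the sketch
below is the key-frame equivalent of the old AddFrameWithDiscardPadding call (the
cluster is assumed to have been Init()'d already):

  #include "mkvmuxer.hpp"

  bool WritePaddedKeyFrame(mkvmuxer::Cluster* cluster,
                           const mkvmuxer::uint8* data, mkvmuxer::uint64 length,
                           mkvmuxer::int64 discard_padding,
                           mkvmuxer::uint64 track_number,
                           mkvmuxer::uint64 abs_timecode) {
    mkvmuxer::Frame frame;
    if (!frame.Init(data, length))
      return false;
    frame.set_discard_padding(discard_padding);
    frame.set_track_number(track_number);
    frame.set_timestamp(abs_timecode);  // timecode units, absolute
    frame.set_is_key(true);  // a non-key frame would additionally need
                             // set_reference_block_timestamp() to pass IsValid()
    return cluster->AddFrame(&frame);
  }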
@@ -1506,11 +1859,7 @@
   return element_size;
 }
 
-template <typename Type>
-bool Cluster::PreWriteBlock(Type* write_function) {
-  if (write_function == NULL)
-    return false;
-
+bool Cluster::PreWriteBlock() {
   if (finalized_)
     return false;
 
@@ -1527,10 +1876,6 @@
   ++blocks_added_;
 }
 
-bool Cluster::IsValidTrackNumber(uint64 track_number) const {
-  return (track_number > 0 && track_number <= 0x7E);
-}
-
 int64 Cluster::GetRelativeTimecode(int64 abs_timecode) const {
   const int64 cluster_timecode = this->Cluster::timecode();
   const int64 rel_timecode =
@@ -1542,79 +1887,14 @@
   return rel_timecode;
 }
 
-bool Cluster::DoWriteBlock(const uint8* frame, uint64 length,
-                           uint64 track_number, uint64 abs_timecode,
-                           uint64 generic_arg, WriteBlock write_block) {
-  if (frame == NULL || length == 0)
+bool Cluster::DoWriteFrame(const Frame* const frame) {
+  if (!frame || !frame->IsValid())
     return false;
 
-  if (!IsValidTrackNumber(track_number))
+  if (!PreWriteBlock())
     return false;
 
-  const int64 rel_timecode = GetRelativeTimecode(abs_timecode);
-  if (rel_timecode < 0)
-    return false;
-
-  if (!PreWriteBlock(write_block))
-    return false;
-
-  const uint64 element_size = (*write_block)(
-      writer_, frame, length, track_number, rel_timecode, generic_arg);
-  if (element_size == 0)
-    return false;
-
-  PostWriteBlock(element_size);
-  return true;
-}
-
-bool Cluster::DoWriteBlockWithAdditional(
-    const uint8* frame, uint64 length, const uint8* additional,
-    uint64 additional_length, uint64 add_id, uint64 track_number,
-    uint64 abs_timecode, uint64 generic_arg, WriteBlockAdditional write_block) {
-  if (frame == NULL || length == 0 || additional == NULL ||
-      additional_length == 0)
-    return false;
-
-  if (!IsValidTrackNumber(track_number))
-    return false;
-
-  const int64 rel_timecode = GetRelativeTimecode(abs_timecode);
-  if (rel_timecode < 0)
-    return false;
-
-  if (!PreWriteBlock(write_block))
-    return false;
-
-  const uint64 element_size =
-      (*write_block)(writer_, frame, length, additional, additional_length,
-                     add_id, track_number, rel_timecode, generic_arg);
-  if (element_size == 0)
-    return false;
-
-  PostWriteBlock(element_size);
-  return true;
-}
-
-bool Cluster::DoWriteBlockWithDiscardPadding(
-    const uint8* frame, uint64 length, int64 discard_padding,
-    uint64 track_number, uint64 abs_timecode, uint64 generic_arg,
-    WriteBlockDiscardPadding write_block) {
-  if (frame == NULL || length == 0 || discard_padding <= 0)
-    return false;
-
-  if (!IsValidTrackNumber(track_number))
-    return false;
-
-  const int64 rel_timecode = GetRelativeTimecode(abs_timecode);
-  if (rel_timecode < 0)
-    return false;
-
-  if (!PreWriteBlock(write_block))
-    return false;
-
-  const uint64 element_size =
-      (*write_block)(writer_, frame, length, discard_padding, track_number,
-                     rel_timecode, generic_arg);
+  const uint64 element_size = WriteFrame(writer_, frame, this);
   if (element_size == 0)
     return false;
 
@@ -1860,7 +2140,7 @@
   if (duration_ > 0.0)
     size += EbmlElementSize(kMkvDuration, static_cast<float>(duration_));
   if (date_utc_ != LLONG_MIN)
-    size += EbmlDateElementSize(kMkvDateUTC, date_utc_);
+    size += EbmlDateElementSize(kMkvDateUTC);
   size += EbmlElementSize(kMkvMuxingApp, muxing_app_);
   size += EbmlElementSize(kMkvWritingApp, writing_app_);
 
@@ -1966,6 +2246,8 @@
       output_cues_(true),
       payload_pos_(0),
       size_position_(0),
+      doc_type_version_(kDefaultDocTypeVersion),
+      doc_type_version_written_(0),
       writer_cluster_(NULL),
       writer_cues_(NULL),
       writer_header_(NULL) {
@@ -2012,7 +2294,6 @@
 
 void Segment::MoveCuesBeforeClustersHelper(uint64 diff, int32 index,
                                            uint64* cues_size) {
-  const uint64 old_cues_size = *cues_size;
   CuePoint* const cue_point = cues_.GetCueByIndex(index);
   if (cue_point == NULL)
     return;
@@ -2020,18 +2301,19 @@
   const uint64 cluster_pos = cue_point->cluster_pos() + diff;
   cue_point->set_cluster_pos(cluster_pos);  // update the new cluster position
   // New size of the cue is computed as follows
-  //    Let a = current size of Cues Element
-  //    Let b = Difference in Cue Point's size after this pass
-  //    Let c = Difference in length of Cues Element's size
-  //            (This is computed as CodedSize(a + b) - CodedSize(a)
-  //    Let d = a + b + c. Now d is the new size of the Cues element which is
-  //                       passed on to the next recursive call.
+  //    Let a = current sum of size of all CuePoints
+  //    Let b = Increase in Cue Point's size due to this iteration
+  //    Let c = Increase in size of Cues Element's length due to this iteration
+  //            (This is computed as CodedSize(a + b) - CodedSize(a))
+  //    Let d = b + c. Now d is the |diff| passed to the next recursive call.
+  //    Let e = a + b. Now e is the |cues_size| passed to the next recursive
+  //                   call.
   const uint64 cue_point_size_diff = cue_point->Size() - old_cue_point_size;
   const uint64 cue_size_diff =
       GetCodedUIntSize(*cues_size + cue_point_size_diff) -
       GetCodedUIntSize(*cues_size);
-  *cues_size += cue_point_size_diff + cue_size_diff;
-  diff = *cues_size - old_cues_size;
+  *cues_size += cue_point_size_diff;
+  diff = cue_size_diff + cue_point_size_diff;
   if (diff > 0) {
     for (int32 i = 0; i < cues_.cue_entries_size(); ++i) {
       MoveCuesBeforeClustersHelper(diff, i, cues_size);
@@ -2041,8 +2323,10 @@
 
 void Segment::MoveCuesBeforeClusters() {
   const uint64 current_cue_size = cues_.Size();
-  uint64 cue_size = current_cue_size;
-  for (int32 i = 0; i < cues_.cue_entries_size(); i++)
+  uint64 cue_size = 0;
+  for (int32 i = 0; i < cues_.cue_entries_size(); ++i)
+    cue_size += cues_.GetCueByIndex(i)->Size();
+  for (int32 i = 0; i < cues_.cue_entries_size(); ++i)
     MoveCuesBeforeClustersHelper(current_cue_size, i, &cue_size);
 
   // Adjust the Seek Entry to reflect the change in position
@@ -2164,12 +2448,24 @@
       if (size_position_ == -1)
         return false;
 
-      const int64 pos = writer_header_->Position();
       const int64 segment_size = MaxOffset();
-
       if (segment_size < 1)
         return false;
 
+      const int64 pos = writer_header_->Position();
+      UpdateDocTypeVersion();
+      if (doc_type_version_ != doc_type_version_written_) {
+        if (writer_header_->Position(0))
+          return false;
+
+        if (!WriteEbmlHeader(writer_header_, doc_type_version_))
+          return false;
+        if (writer_header_->Position() != ebml_header_size_)
+          return false;
+
+        doc_type_version_written_ = doc_type_version_;
+      }
+
       if (writer_header_->Position(size_position_))
         return false;
 
@@ -2210,6 +2506,8 @@
 
 Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); }
 
+Tag* Segment::AddTag() { return tags_.AddTag(); }
+
 uint64 Segment::AddVideoTrack(int32 width, int32 height, int32 number) {
   VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_);  // NOLINT
   if (!track)
@@ -2264,187 +2562,72 @@
   return track->number();
 }
 
-bool Segment::AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+bool Segment::AddFrame(const uint8* data, uint64 length, uint64 track_number,
                        uint64 timestamp, bool is_key) {
-  if (!frame)
+  if (!data)
     return false;
 
-  if (!CheckHeaderInfo())
+  Frame frame;
+  if (!frame.Init(data, length))
     return false;
-
-  // Check for non-monotonically increasing timestamps.
-  if (timestamp < last_timestamp_)
-    return false;
-
-  // If the segment has a video track hold onto audio frames to make sure the
-  // audio that is associated with the start time of a video key-frame is
-  // muxed into the same cluster.
-  if (has_video_ && tracks_.TrackIsAudio(track_number) && !force_new_cluster_) {
-    Frame* const new_frame = new (std::nothrow) Frame();
-    if (new_frame == NULL || !new_frame->Init(frame, length))
-      return false;
-    new_frame->set_track_number(track_number);
-    new_frame->set_timestamp(timestamp);
-    new_frame->set_is_key(is_key);
-
-    if (!QueueFrame(new_frame))
-      return false;
-
-    return true;
-  }
-
-  if (!DoNewClusterProcessing(track_number, timestamp, is_key))
-    return false;
-
-  if (cluster_list_size_ < 1)
-    return false;
-
-  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
-  if (!cluster)
-    return false;
-
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 abs_timecode = timestamp / timecode_scale;
-
-  if (!cluster->AddFrame(frame, length, track_number, abs_timecode, is_key))
-    return false;
-
-  if (new_cuepoint_ && cues_track_ == track_number) {
-    if (!AddCuePoint(timestamp, cues_track_))
-      return false;
-  }
-
-  if (timestamp > last_timestamp_)
-    last_timestamp_ = timestamp;
-
-  return true;
+  frame.set_track_number(track_number);
+  frame.set_timestamp(timestamp);
+  frame.set_is_key(is_key);
+  return AddGenericFrame(&frame);
 }
 
-bool Segment::AddFrameWithAdditional(const uint8* frame, uint64 length,
+bool Segment::AddFrameWithAdditional(const uint8* data, uint64 length,
                                      const uint8* additional,
                                      uint64 additional_length, uint64 add_id,
                                      uint64 track_number, uint64 timestamp,
                                      bool is_key) {
-  if (frame == NULL || additional == NULL)
+  if (!data || !additional)
     return false;
 
-  if (!CheckHeaderInfo())
+  Frame frame;
+  if (!frame.Init(data, length) ||
+      !frame.AddAdditionalData(additional, additional_length, add_id)) {
     return false;
-
-  // Check for non-monotonically increasing timestamps.
-  if (timestamp < last_timestamp_)
-    return false;
-
-  // If the segment has a video track hold onto audio frames to make sure the
-  // audio that is associated with the start time of a video key-frame is
-  // muxed into the same cluster.
-  if (has_video_ && tracks_.TrackIsAudio(track_number) && !force_new_cluster_) {
-    Frame* const new_frame = new (std::nothrow) Frame();
-    if (new_frame == NULL || !new_frame->Init(frame, length))
-      return false;
-    new_frame->set_track_number(track_number);
-    new_frame->set_timestamp(timestamp);
-    new_frame->set_is_key(is_key);
-
-    if (!QueueFrame(new_frame))
-      return false;
-
-    return true;
   }
-
-  if (!DoNewClusterProcessing(track_number, timestamp, is_key))
-    return false;
-
-  if (cluster_list_size_ < 1)
-    return false;
-
-  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
-  if (cluster == NULL)
-    return false;
-
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 abs_timecode = timestamp / timecode_scale;
-
-  if (!cluster->AddFrameWithAdditional(frame, length, additional,
-                                       additional_length, add_id, track_number,
-                                       abs_timecode, is_key))
-    return false;
-
-  if (new_cuepoint_ && cues_track_ == track_number) {
-    if (!AddCuePoint(timestamp, cues_track_))
-      return false;
-  }
-
-  if (timestamp > last_timestamp_)
-    last_timestamp_ = timestamp;
-
-  return true;
+  frame.set_track_number(track_number);
+  frame.set_timestamp(timestamp);
+  frame.set_is_key(is_key);
+  return AddGenericFrame(&frame);
 }
 
-bool Segment::AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+bool Segment::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
                                          int64 discard_padding,
                                          uint64 track_number, uint64 timestamp,
                                          bool is_key) {
-  if (frame == NULL || discard_padding <= 0)
+  if (!data)
     return false;
 
-  if (!CheckHeaderInfo())
+  Frame frame;
+  if (!frame.Init(data, length))
     return false;
-
-  // Check for non-monotonically increasing timestamps.
-  if (timestamp < last_timestamp_)
-    return false;
-
-  // If the segment has a video track hold onto audio frames to make sure the
-  // audio that is associated with the start time of a video key-frame is
-  // muxed into the same cluster.
-  if (has_video_ && tracks_.TrackIsAudio(track_number) && !force_new_cluster_) {
-    Frame* const new_frame = new (std::nothrow) Frame();
-    if (new_frame == NULL || !new_frame->Init(frame, length))
-      return false;
-    new_frame->set_track_number(track_number);
-    new_frame->set_timestamp(timestamp);
-    new_frame->set_is_key(is_key);
-    new_frame->set_discard_padding(discard_padding);
-
-    if (!QueueFrame(new_frame))
-      return false;
-
-    return true;
-  }
-
-  if (!DoNewClusterProcessing(track_number, timestamp, is_key))
-    return false;
-
-  if (cluster_list_size_ < 1)
-    return false;
-
-  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
-  if (!cluster)
-    return false;
-
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 abs_timecode = timestamp / timecode_scale;
-
-  if (!cluster->AddFrameWithDiscardPadding(
-          frame, length, discard_padding, track_number, abs_timecode, is_key)) {
-    return false;
-  }
-
-  if (new_cuepoint_ && cues_track_ == track_number) {
-    if (!AddCuePoint(timestamp, cues_track_))
-      return false;
-  }
-
-  if (timestamp > last_timestamp_)
-    last_timestamp_ = timestamp;
-
-  return true;
+  frame.set_discard_padding(discard_padding);
+  frame.set_track_number(track_number);
+  frame.set_timestamp(timestamp);
+  frame.set_is_key(is_key);
+  return AddGenericFrame(&frame);
 }
 
-bool Segment::AddMetadata(const uint8* frame, uint64 length,
-                          uint64 track_number, uint64 timestamp_ns,
-                          uint64 duration_ns) {
+bool Segment::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                          uint64 timestamp_ns, uint64 duration_ns) {
+  if (!data)
+    return false;
+
+  Frame frame;
+  if (!frame.Init(data, length))
+    return false;
+  frame.set_track_number(track_number);
+  frame.set_timestamp(timestamp_ns);
+  frame.set_duration(duration_ns);
+  frame.set_is_key(true);  // All metadata blocks are keyframes.
+  return AddGenericFrame(&frame);
+}
+
+bool Segment::AddGenericFrame(const Frame* frame) {
   if (!frame)
     return false;
 
@@ -2452,55 +2635,71 @@
     return false;
 
   // Check for non-monotonically increasing timestamps.
-  if (timestamp_ns < last_timestamp_)
+  if (frame->timestamp() < last_timestamp_)
     return false;
 
-  if (!DoNewClusterProcessing(track_number, timestamp_ns, true))
+  // Check if the track number is valid.
+  if (!tracks_.GetTrackByNumber(frame->track_number()))
     return false;
 
+  if (frame->discard_padding() != 0)
+    doc_type_version_ = 4;
+
+  // If the segment has a video track, hold onto audio frames to make sure the
+  // audio that is associated with the start time of a video key-frame is
+  // muxed into the same cluster.
+  if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
+      !force_new_cluster_) {
+    Frame* const new_frame = new (std::nothrow) Frame();
+    if (!new_frame || !new_frame->CopyFrom(*frame))
+      return false;
+    return QueueFrame(new_frame);
+  }
+
+  if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(),
+                              frame->is_key())) {
+    return false;
+  }
+
   if (cluster_list_size_ < 1)
     return false;
 
   Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
-
   if (!cluster)
     return false;
 
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 abs_timecode = timestamp_ns / timecode_scale;
-  const uint64 duration_timecode = duration_ns / timecode_scale;
+  // If the Frame is not a SimpleBlock, then set the reference_block_timestamp
+  // if it is not set already.
+  bool frame_created = false;
+  if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
+      !frame->reference_block_timestamp_set()) {
+    Frame* const new_frame = new (std::nothrow) Frame();
+    if (!new_frame || !new_frame->CopyFrom(*frame))
+      return false;
+    new_frame->set_reference_block_timestamp(
+        last_track_timestamp_[frame->track_number() - 1]);
+    frame = new_frame;
+    frame_created = true;
+  }
 
-  if (!cluster->AddMetadata(frame, length, track_number, abs_timecode,
-                            duration_timecode))
+  if (!cluster->AddFrame(frame))
     return false;
 
-  if (timestamp_ns > last_timestamp_)
-    last_timestamp_ = timestamp_ns;
+  if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+    if (!AddCuePoint(frame->timestamp(), cues_track_))
+      return false;
+  }
+
+  last_timestamp_ = frame->timestamp();
+  last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+  last_block_duration_ = frame->duration();
+
+  if (frame_created)
+    delete frame;
 
   return true;
 }
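AddGenericFrame is now the single entry point behind all the AddFrame* wrappers. The
Frame-based path also covers cases the wrappers do not, e.g. a non-key BlockGroup frame
with an explicit ReferenceBlock timestamp, which IsValid() requires when the frame
cannot be a SimpleBlock. A sketch under that assumption (all values hypothetical):

  #include "mkvmuxer.hpp"

  bool AddDeltaBlockGroupFrame(mkvmuxer::Segment* segment,
                               const mkvmuxer::uint8* data,
                               mkvmuxer::uint64 length,
                               mkvmuxer::uint64 track_number,
                               mkvmuxer::uint64 timestamp_ns,
                               mkvmuxer::uint64 duration_ns,
                               mkvmuxer::int64 reference_timestamp_ns) {
    mkvmuxer::Frame frame;
    if (!frame.Init(data, length))
      return false;
    frame.set_track_number(track_number);
    frame.set_timestamp(timestamp_ns);
    frame.set_duration(duration_ns);  // a duration forces a BlockGroup
    frame.set_is_key(false);
    frame.set_reference_block_timestamp(reference_timestamp_ns);
    return segment->AddGenericFrame(&frame);
  }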
 
-bool Segment::AddGenericFrame(const Frame* frame) {
-  last_block_duration_ = frame->duration();
-  if (!tracks_.TrackIsAudio(frame->track_number()) &&
-      !tracks_.TrackIsVideo(frame->track_number()) && frame->duration() > 0) {
-    return AddMetadata(frame->frame(), frame->length(), frame->track_number(),
-                       frame->timestamp(), frame->duration());
-  } else if (frame->additional() && frame->additional_length() > 0) {
-    return AddFrameWithAdditional(
-        frame->frame(), frame->length(), frame->additional(),
-        frame->additional_length(), frame->add_id(), frame->track_number(),
-        frame->timestamp(), frame->is_key());
-  } else if (frame->discard_padding() > 0) {
-    return AddFrameWithDiscardPadding(
-        frame->frame(), frame->length(), frame->discard_padding(),
-        frame->track_number(), frame->timestamp(), frame->is_key());
-  } else {
-    return AddFrame(frame->frame(), frame->length(), frame->track_number(),
-                    frame->timestamp(), frame->is_key());
-  }
-}
-
 void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; }
 
 bool Segment::SetChunking(bool chunking, const char* filename) {
@@ -2598,9 +2797,13 @@
 }
 
 bool Segment::WriteSegmentHeader() {
+  UpdateDocTypeVersion();
+
   // TODO(fgalligan): Support more than one segment.
-  if (!WriteEbmlHeader(writer_header_))
+  if (!WriteEbmlHeader(writer_header_, doc_type_version_))
     return false;
+  doc_type_version_written_ = doc_type_version_;
+  ebml_header_size_ = static_cast<int32>(writer_header_->Position());
 
   // Write "unknown" (-1) as segment size value. If mode is kFile, Segment
   // will write over duration when the file is finalized.
@@ -2645,6 +2848,13 @@
       return false;
   }
 
+  if (tags_.Count() > 0) {
+    if (!seek_head_.AddSeekEntry(kMkvTags, MaxOffset()))
+      return false;
+    if (!tags_.Write(writer_header_))
+      return false;
+  }
+
   if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) {
     if (!chunk_writer_header_)
       return false;
@@ -2740,7 +2950,7 @@
     const int32 new_capacity =
         (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
     Cluster** const clusters =
-        new (std::nothrow) Cluster* [new_capacity];  // NOLINT
+        new (std::nothrow) Cluster*[new_capacity];  // NOLINT
     if (!clusters)
       return false;
 
@@ -2796,7 +3006,8 @@
 
   Cluster*& cluster = cluster_list_[cluster_list_size_];
   const int64 offset = MaxOffset();
-  cluster = new (std::nothrow) Cluster(cluster_timecode, offset);  // NOLINT
+  cluster = new (std::nothrow) Cluster(cluster_timecode,  // NOLINT
+                                       offset, segment_info_.timecode_scale());
   if (!cluster)
     return false;
 
@@ -2873,6 +3084,19 @@
   return true;
 }
 
+void Segment::UpdateDocTypeVersion() {
+  for (uint32 index = 0; index < tracks_.track_entries_size(); ++index) {
+    const Track* track = tracks_.GetTrackByIndex(index);
+    if (track == NULL)
+      break;
+    if ((track->codec_delay() || track->seek_pre_roll()) &&
+        doc_type_version_ < 4) {
+      doc_type_version_ = 4;
+      break;
+    }
+  }
+}
+
 bool Segment::UpdateChunkName(const char* ext, char** name) const {
   if (!name || !ext)
     return false;
@@ -2932,7 +3156,7 @@
     if (new_capacity < 1)
       return false;
 
-    Frame** const frames = new (std::nothrow) Frame* [new_capacity];  // NOLINT
+    Frame** const frames = new (std::nothrow) Frame*[new_capacity];  // NOLINT
     if (!frames)
       return false;
 
@@ -2962,34 +3186,24 @@
   if (!cluster)
     return -1;
 
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-
   for (int32 i = 0; i < frames_size_; ++i) {
     Frame*& frame = frames_[i];
-    const uint64 frame_timestamp = frame->timestamp();  // ns
-    const uint64 frame_timecode = frame_timestamp / timecode_scale;
-
-    if (frame->discard_padding() > 0) {
-      if (!cluster->AddFrameWithDiscardPadding(
-              frame->frame(), frame->length(), frame->discard_padding(),
-              frame->track_number(), frame_timecode, frame->is_key())) {
-        return -1;
-      }
-    } else {
-      if (!cluster->AddFrame(frame->frame(), frame->length(),
-                             frame->track_number(), frame_timecode,
-                             frame->is_key())) {
-        return -1;
-      }
-    }
+    // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the
+    // places where |doc_type_version_| needs to be updated.
+    if (frame->discard_padding() != 0)
+      doc_type_version_ = 4;
+    if (!cluster->AddFrame(frame))
+      return -1;
 
     if (new_cuepoint_ && cues_track_ == frame->track_number()) {
-      if (!AddCuePoint(frame_timestamp, cues_track_))
+      if (!AddCuePoint(frame->timestamp(), cues_track_))
         return -1;
     }
 
-    if (frame_timestamp > last_timestamp_)
-      last_timestamp_ = frame_timestamp;
+    if (frame->timestamp() > last_timestamp_) {
+      last_timestamp_ = frame->timestamp();
+      last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+    }
 
     delete frame;
     frame = NULL;
@@ -3013,7 +3227,6 @@
     if (!cluster)
       return false;
 
-    const uint64 timecode_scale = segment_info_.timecode_scale();
     int32 shift_left = 0;
 
     // TODO(fgalligan): Change this to use the durations of frames instead of
@@ -3025,33 +3238,22 @@
         break;
 
       const Frame* const frame_prev = frames_[i - 1];
-      const uint64 frame_timestamp = frame_prev->timestamp();
-      const uint64 frame_timecode = frame_timestamp / timecode_scale;
-      const int64 discard_padding = frame_prev->discard_padding();
-
-      if (discard_padding > 0) {
-        if (!cluster->AddFrameWithDiscardPadding(
-                frame_prev->frame(), frame_prev->length(), discard_padding,
-                frame_prev->track_number(), frame_timecode,
-                frame_prev->is_key())) {
-          return false;
-        }
-      } else {
-        if (!cluster->AddFrame(frame_prev->frame(), frame_prev->length(),
-                               frame_prev->track_number(), frame_timecode,
-                               frame_prev->is_key())) {
-          return false;
-        }
-      }
+      if (frame_prev->discard_padding() != 0)
+        doc_type_version_ = 4;
+      if (!cluster->AddFrame(frame_prev))
+        return false;
 
       if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
-        if (!AddCuePoint(frame_timestamp, cues_track_))
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
           return false;
       }
 
       ++shift_left;
-      if (frame_timestamp > last_timestamp_)
-        last_timestamp_ = frame_timestamp;
+      if (frame_prev->timestamp() > last_timestamp_) {
+        last_timestamp_ = frame_prev->timestamp();
+        last_track_timestamp_[frame_prev->track_number() - 1] =
+            frame_prev->timestamp();
+      }
 
       delete frame_prev;
     }
diff --git a/libvpx/third_party/libwebm/mkvmuxer.hpp b/libvpx/third_party/libwebm/mkvmuxer.hpp
index 1c1c310..497ad4c 100644
--- a/libvpx/third_party/libwebm/mkvmuxer.hpp
+++ b/libvpx/third_party/libwebm/mkvmuxer.hpp
@@ -23,6 +23,8 @@
 class MkvWriter;
 class Segment;
 
+const uint64 kMaxTrackNumber = 126;
+
 ///////////////////////////////////////////////////////////////
 // Interface used by the mkvmuxer to write out the Mkv data.
 class IMkvWriter {
@@ -57,6 +59,10 @@
 
 // Writes out the EBML header for a WebM file. This function must be called
 // before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version);
+
+// Deprecated. Writes out the EBML header with doc_type_version set to
+// Segment::kDefaultDocTypeVersion. Exists for backward compatibility.
 bool WriteEbmlHeader(IMkvWriter* writer);
 
 // Copies in Chunk from source to destination between the given byte positions
@@ -70,12 +76,23 @@
   Frame();
   ~Frame();
 
+  // Sets this frame's contents based on |frame|. Returns true on success. On
+  // failure, this frame's existing contents may be lost.
+  bool CopyFrom(const Frame& frame);
+
   // Copies |frame| data into |frame_|. Returns true on success.
   bool Init(const uint8* frame, uint64 length);
 
   // Copies |additional| data into |additional_|. Returns true on success.
   bool AddAdditionalData(const uint8* additional, uint64 length, uint64 add_id);
 
+  // Returns true if the frame has valid parameters.
+  bool IsValid() const;
+
+  // Returns true if the frame can be written as a SimpleBlock based on current
+  // parameters.
+  bool CanBeSimpleBlock() const;
+
   uint64 add_id() const { return add_id_; }
   const uint8* additional() const { return additional_; }
   uint64 additional_length() const { return additional_length_; }
@@ -89,10 +106,15 @@
   uint64 track_number() const { return track_number_; }
   void set_timestamp(uint64 timestamp) { timestamp_ = timestamp; }
   uint64 timestamp() const { return timestamp_; }
-  void set_discard_padding(uint64 discard_padding) {
+  void set_discard_padding(int64 discard_padding) {
     discard_padding_ = discard_padding;
   }
-  uint64 discard_padding() const { return discard_padding_; }
+  int64 discard_padding() const { return discard_padding_; }
+  void set_reference_block_timestamp(int64 reference_block_timestamp);
+  int64 reference_block_timestamp() const { return reference_block_timestamp_; }
+  bool reference_block_timestamp_set() const {
+    return reference_block_timestamp_set_;
+  }
 
  private:
   // Id of the Additional data.
@@ -124,6 +146,14 @@
 
   // Discard padding for the frame.
   int64 discard_padding_;
+
+  // Reference block timestamp.
+  int64 reference_block_timestamp_;
+
+  // Flag indicating if |reference_block_timestamp_| has been set.
+  bool reference_block_timestamp_set_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame);
 };
 
 ///////////////////////////////////////////////////////////////
@@ -422,6 +452,16 @@
   uint64 display_height() const { return display_height_; }
   void set_display_width(uint64 width) { display_width_ = width; }
   uint64 display_width() const { return display_width_; }
+
+  void set_crop_left(uint64 crop_left) { crop_left_ = crop_left; }
+  uint64 crop_left() const { return crop_left_; }
+  void set_crop_right(uint64 crop_right) { crop_right_ = crop_right; }
+  uint64 crop_right() const { return crop_right_; }
+  void set_crop_top(uint64 crop_top) { crop_top_ = crop_top; }
+  uint64 crop_top() const { return crop_top_; }
+  void set_crop_bottom(uint64 crop_bottom) { crop_bottom_ = crop_bottom; }
+  uint64 crop_bottom() const { return crop_bottom_; }
+
   void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; }
   double frame_rate() const { return frame_rate_; }
   void set_height(uint64 height) { height_ = height; }
@@ -438,6 +478,10 @@
   // Video track element names.
   uint64 display_height_;
   uint64 display_width_;
+  uint64 crop_left_;
+  uint64 crop_right_;
+  uint64 crop_top_;
+  uint64 crop_bottom_;
   double frame_rate_;
   uint64 height_;
   uint64 stereo_mode_;
@@ -489,6 +533,7 @@
   static const char kVorbisCodecId[];
   static const char kVp8CodecId[];
   static const char kVp9CodecId[];
+  static const char kVp10CodecId[];
 
   Tracks();
   ~Tracks();
@@ -693,38 +738,148 @@
 };
 
 ///////////////////////////////////////////////////////////////
+// Tag element
+//
+class Tag {
+ public:
+  bool add_simple_tag(const char* tag_name, const char* tag_string);
+
+ private:
+  // Tags calls Clear and the destructor of Tag
+  friend class Tags;
+
+  // For storage of simple tags
+  class SimpleTag {
+   public:
+    // Establish representation invariant for new SimpleTag object.
+    void Init();
+
+    // Reclaim resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the tag name to the |tag_name_| member.  Returns false on
+    // error.
+    bool set_tag_name(const char* tag_name);
+
+    // Copies the tag string to the |tag_string_| member.  Returns false
+    // on error.
+    bool set_tag_string(const char* tag_string);
+
+    // If |writer| is non-NULL, serialize the SimpleTag sub-element of
+    // the Tag into the stream.  Returns the SimpleTag element size on
+    // success, 0 if error.
+    uint64 Write(IMkvWriter* writer) const;
+
+   private:
+    char* tag_name_;
+    char* tag_string_;
+  };
+
+  Tag();
+  ~Tag();
+
+  // Copies this Tag object to a different one.  This is used when
+  // expanding a plain array of Tag objects (see Tags).
+  void ShallowCopy(Tag* dst) const;
+
+  // Reclaim resources used by this Tag object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining in the |simple_tags_| array for a
+  // new SimpleTag object, creates a new, longer array and copies the
+  // existing SimpleTag objects to the new array.  Returns false if the
+  // array cannot be expanded.
+  bool ExpandSimpleTagsArray();
+
+  // If |writer| is non-NULL, serialize the Tag sub-element into the
+  // stream.  Returns the total size of the element on success, 0 if
+  // error.
+  uint64 Write(IMkvWriter* writer) const;
+
+  // The Tag element can contain multiple SimpleTag sub-elements.
+  SimpleTag* simple_tags_;
+
+  // The physical length (total size) of the |simple_tags_| array.
+  int simple_tags_size_;
+
+  // The logical length (number of active elements) in the |simple_tags_|
+  // array.
+  int simple_tags_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+//
+class Tags {
+ public:
+  Tags();
+  ~Tags();
+
+  Tag* AddTag();
+
+  // Returns the number of tags that have been added.
+  int Count() const;
+
+  // Output the Tags element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the tags_ array if there is not enough space to contain
+  // another tag object.  Returns true on success.
+  bool ExpandTagsArray();
+
+  // Total length of the tags_ array.
+  int tags_size_;
+
+  // Number of active tags in the tags_ array.
+  int tags_count_;
+
+  // Array for storage of tag objects.
+  Tag* tags_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
+
+///////////////////////////////////////////////////////////////
 // Cluster element
 //
 // Notes:
 //  |Init| must be called before any other method in this class.
 class Cluster {
  public:
-  Cluster(uint64 timecode, int64 cues_pos);
-  ~Cluster();
-
   // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
   // position for the cluster within the segment that should be written in
-  // the cues element.
+  // the cues element. |timecode_scale| is the timecode scale of the segment.
+  Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale);
+  ~Cluster();
+
   bool Init(IMkvWriter* ptr_writer);
 
   // Adds a frame to be output in the file. The frame is written out through
   // |writer_| if successful. Returns true on success.
+  bool AddFrame(const Frame* frame);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
   // Inputs:
-  //   frame: Pointer to the data
+  //   data: Pointer to the data
   //   length: Length of the data
   //   track_number: Track to add the data to. Value returned by Add track
   //                 functions.  The range of allowed values is [1, 126].
   //   timecode:     Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
                 uint64 timecode,  // timecode units (absolute)
                 bool is_key);
 
   // Adds a frame to be output in the file. The frame is written out through
   // |writer_| if successful. Returns true on success.
   // Inputs:
-  //   frame: Pointer to the data
+  //   data: Pointer to the data
   //   length: Length of the data
   //   additional: Pointer to the additional data
   //   additional_length: Length of the additional data
@@ -734,7 +889,7 @@
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* frame, uint64 length,
+  bool AddFrameWithAdditional(const uint8* data, uint64 length,
                               const uint8* additional, uint64 additional_length,
                               uint64 add_id, uint64 track_number,
                               uint64 abs_timecode, bool is_key);
@@ -742,7 +897,7 @@
   // Adds a frame to be output in the file. The frame is written out through
   // |writer_| if successful. Returns true on success.
   // Inputs:
-  //   frame: Pointer to the data.
+  //   data: Pointer to the data.
   //   length: Length of the data.
   //   discard_padding: DiscardPadding element value.
   //   track_number: Track to add the data to. Value returned by Add track
@@ -750,14 +905,14 @@
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
                                   int64 discard_padding, uint64 track_number,
                                   uint64 abs_timecode, bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
   // Inputs:
-  //   frame: Pointer to the data
+  //   data: Pointer to the data
   //   length: Length of the data
   //   track_number: Track to add the data to. Value returned by Add track
   //                 functions.  The range of allowed values is [1, 126].
@@ -768,7 +923,7 @@
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* frame, uint64 length, uint64 track_number,
+  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
                    uint64 timecode, uint64 duration);
 
   // Increments the size of the cluster's data in bytes.
@@ -781,75 +936,29 @@
   // Returns the size in bytes for the entire Cluster element.
   uint64 Size() const;
 
+  // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+  // Returns -1 on failure, or a relative timecode.
+  int64 GetRelativeTimecode(int64 abs_timecode) const;
+
   int64 size_position() const { return size_position_; }
   int32 blocks_added() const { return blocks_added_; }
   uint64 payload_size() const { return payload_size_; }
   int64 position_for_cues() const { return position_for_cues_; }
   uint64 timecode() const { return timecode_; }
+  uint64 timecode_scale() const { return timecode_scale_; }
 
  private:
-  //  Signature that matches either of WriteSimpleBlock or WriteMetadataBlock
-  //  in the muxer utilities package.
-  typedef uint64 (*WriteBlock)(IMkvWriter* writer, const uint8* data,
-                               uint64 length, uint64 track_number,
-                               int64 timecode, uint64 generic_arg);
-
-  //  Signature that matches WriteBlockWithAdditional
-  //  in the muxer utilities package.
-  typedef uint64 (*WriteBlockAdditional)(IMkvWriter* writer, const uint8* data,
-                                         uint64 length, const uint8* additional,
-                                         uint64 add_id,
-                                         uint64 additional_length,
-                                         uint64 track_number, int64 timecode,
-                                         uint64 is_key);
-
-  //  Signature that matches WriteBlockWithDiscardPadding
-  //  in the muxer utilities package.
-  typedef uint64 (*WriteBlockDiscardPadding)(IMkvWriter* writer,
-                                             const uint8* data, uint64 length,
-                                             int64 discard_padding,
-                                             uint64 track_number,
-                                             int64 timecode, uint64 is_key);
-
   // Utility method that confirms that blocks can still be added, and that the
-  // cluster header has been written. Used by |DoWriteBlock*|. Returns true
+  // cluster header has been written. Used by |DoWriteFrame|. Returns true
   // when successful.
-  template <typename Type>
-  bool PreWriteBlock(Type* write_function);
+  bool PreWriteBlock();
 
-  // Utility method used by the |DoWriteBlock*| methods that handles the book
+  // Utility method used by |DoWriteFrame| that handles the book
   // keeping required after each block is written.
   void PostWriteBlock(uint64 element_size);
 
-  // To simplify things, we require that there be fewer than 127
-  // tracks -- this allows us to serialize the track number value for
-  // a stream using a single byte, per the Matroska encoding.
-  bool IsValidTrackNumber(uint64 track_number) const;
-
-  // Given |abs_timecode|, calculates timecode relative to most recent timecode.
-  // Returns -1 on failure, or a relative timecode.
-  int64 GetRelativeTimecode(int64 abs_timecode) const;
-
-  //  Used to implement AddFrame and AddMetadata.
-  bool DoWriteBlock(const uint8* frame, uint64 length, uint64 track_number,
-                    uint64 absolute_timecode, uint64 generic_arg,
-                    WriteBlock write_block);
-
-  // Used to implement AddFrameWithAdditional
-  bool DoWriteBlockWithAdditional(const uint8* frame, uint64 length,
-                                  const uint8* additional,
-                                  uint64 additional_length, uint64 add_id,
-                                  uint64 track_number, uint64 absolute_timecode,
-                                  uint64 generic_arg,
-                                  WriteBlockAdditional write_block);
-
-  // Used to implement AddFrameWithDiscardPadding
-  bool DoWriteBlockWithDiscardPadding(const uint8* frame, uint64 length,
-                                      int64 discard_padding,
-                                      uint64 track_number,
-                                      uint64 absolute_timecode,
-                                      uint64 generic_arg,
-                                      WriteBlockDiscardPadding write_block);
+  // Does some verification and calls WriteFrame.
+  bool DoWriteFrame(const Frame* const frame);
 
   // Outputs the Cluster header to |writer_|. Returns true on success.
   bool WriteClusterHeader();
@@ -875,6 +984,9 @@
   // The absolute timecode of the cluster.
   const uint64 timecode_;
 
+  // The timecode scale of the Segment containing the cluster.
+  const uint64 timecode_scale_;
+
   // Pointer to the writer object. Not owned by this class.
   IMkvWriter* writer_;
 
@@ -996,6 +1108,7 @@
     kBeforeClusters = 0x1  // Position Cues before Clusters
   };
 
+  const static uint32 kDefaultDocTypeVersion = 2;
   const static uint64 kDefaultMaxClusterDuration = 30000000000ULL;
 
   Segment();
@@ -1023,6 +1136,11 @@
   // populate its fields via the Chapter member functions.
   Chapter* AddChapter();
 
+  // Adds an empty tag to the tags of this segment.  Returns
+  // non-NULL on success.  After adding the tag, the caller should
+  // populate its fields via the Tag member functions.
+  Tag* AddTag();
+
   // Adds a cue point to the Cues element. |timestamp| is the time in
   // nanoseconds of the cue's time. |track| is the Track of the Cue. This
   // function must be called after AddFrame to calculate the correct
@@ -1031,19 +1149,19 @@
 
   // Adds a frame to be output in the file. Returns true on success.
   // Inputs:
-  //   frame: Pointer to the data
+  //   data: Pointer to the data
   //   length: Length of the data
   //   track_number: Track to add the data to. Value returned by Add track
   //                 functions.
   //   timestamp:    Timestamp of the frame in nanoseconds from 0.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
                 uint64 timestamp_ns, bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
   // Inputs:
-  //   frame: Pointer to the data
+  //   data: Pointer to the data
   //   length: Length of the data
   //   track_number: Track to add the data to. Value returned by Add track
   //                 functions.
@@ -1054,13 +1172,13 @@
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* frame, uint64 length, uint64 track_number,
+  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
                    uint64 timestamp_ns, uint64 duration_ns);
 
   // Writes a frame with additional data to the output medium; returns true on
   // success.
   // Inputs:
-  //   frame: Pointer to the data.
+  //   data: Pointer to the data.
   //   length: Length of the data.
   //   additional: Pointer to additional data.
   //   additional_length: Length of additional data.
@@ -1070,7 +1188,7 @@
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* frame, uint64 length,
+  bool AddFrameWithAdditional(const uint8* data, uint64 length,
                               const uint8* additional, uint64 additional_length,
                               uint64 add_id, uint64 track_number,
                               uint64 timestamp, bool is_key);
@@ -1078,7 +1196,7 @@
   // Writes a frame with DiscardPadding to the output medium; returns true on
   // success.
   // Inputs:
-  //   frame: Pointer to the data.
+  //   data: Pointer to the data.
   //   length: Length of the data.
   //   discard_padding: DiscardPadding element value.
   //   track_number: Track to add the data to. Value returned by Add track
@@ -1086,7 +1204,7 @@
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
                                   int64 discard_padding, uint64 track_number,
                                   uint64 timestamp, bool is_key);
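
For orientation, a minimal sketch of how these Segment entry points are typically
driven from application code. The writer setup, track parameters, tag strings and
output path below are illustrative assumptions rather than part of this change,
and add_simple_tag() is assumed to be the setter that pairs with AddTag():

  #include "mkvmuxer.hpp"
  #include "mkvwriter.hpp"

  bool MuxOneFrame(const mkvmuxer::uint8* data, mkvmuxer::uint64 length,
                   mkvmuxer::uint64 timestamp_ns) {
    mkvmuxer::MkvWriter writer;
    if (!writer.Open("out.webm"))  // illustrative output path
      return false;

    mkvmuxer::Segment segment;
    if (!segment.Init(&writer))
      return false;

    // AddVideoTrack() returns the track number expected by AddFrame().
    const mkvmuxer::uint64 track = segment.AddVideoTrack(640, 480, 0);
    if (!track)
      return false;

    // Optional: attach a tag via the new AddTag() entry point.
    mkvmuxer::Tag* const tag = segment.AddTag();
    if (!tag || !tag->add_simple_tag("ENCODER", "libvpx"))
      return false;

    // Frames must be added in non-decreasing timestamp order.
    if (!segment.AddFrame(data, length, track, timestamp_ns, true /*is_key*/))
      return false;

    const bool ok = segment.Finalize();
    writer.Close();
    return ok;
  }
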
 
@@ -1177,6 +1295,9 @@
   // Cues elements.
   bool CheckHeaderInfo();
 
+  // Sets |doc_type_version_| based on the current element requirements.
+  void UpdateDocTypeVersion();
+
   // Sets |name| according to how many chunks have been written. |ext| is the
   // file extension. |name| must be deleted by the calling app. Returns true
   // on success.
@@ -1233,7 +1354,7 @@
   // diff - indicates the difference in size of the Cues element that needs to
   //        be accounted for.
   // index - index in the list of Cues which is currently being adjusted.
-  // cue_size - size of the Cues element.
+  // cue_size - sum of the sizes of all the CuePoint elements.
   void MoveCuesBeforeClustersHelper(uint64 diff, int index, uint64* cue_size);
 
   // Seeds the random number generator used to make UIDs.
@@ -1245,6 +1366,7 @@
   SegmentInfo segment_info_;
   Tracks tracks_;
   Chapters chapters_;
+  Tags tags_;
 
   // Number of chunks written.
   int chunk_count_;
@@ -1316,6 +1438,9 @@
   // Last timestamp in nanoseconds added to a cluster.
   uint64 last_timestamp_;
 
+  // Last timestamp in nanoseconds by track number added to a cluster.
+  uint64 last_track_timestamp_[kMaxTrackNumber];
+
   // Maximum time in nanoseconds for a cluster duration. This variable is a
   // guideline and some clusters may have a longer duration. Default is 30
   // seconds.
@@ -1337,12 +1462,23 @@
   // Flag whether or not the muxer should output a Cues element.
   bool output_cues_;
 
+  // The size of the EBML header, used to validate the header if
+  // WriteEbmlHeader() is called more than once.
+  int32 ebml_header_size_;
+
   // The file position of the segment's payload.
   int64 payload_pos_;
 
   // The file position of the element's size.
   int64 size_position_;
 
+  // The current DocTypeVersion (|doc_type_version_|) and the value written by
+  // WriteSegmentHeader() (|doc_type_version_written_|).
+  // Finalize() calls WriteEbmlHeader() again if |doc_type_version_| differs
+  // from |doc_type_version_written_|.
+  uint32 doc_type_version_;
+  uint32 doc_type_version_written_;
+
   // Pointer to the writer objects. Not owned by this class.
   IMkvWriter* writer_cluster_;
   IMkvWriter* writer_cues_;
diff --git a/libvpx/third_party/libwebm/mkvmuxerutil.cpp b/libvpx/third_party/libwebm/mkvmuxerutil.cpp
index 3fb9bc9..27ab15d 100644
--- a/libvpx/third_party/libwebm/mkvmuxerutil.cpp
+++ b/libvpx/third_party/libwebm/mkvmuxerutil.cpp
@@ -15,18 +15,19 @@
 #include <cassert>
 #include <cmath>
 #include <cstdio>
-#ifdef _MSC_VER
-#define _CRT_RAND_S
-#endif
 #include <cstdlib>
 #include <cstring>
 #include <ctime>
-
 #include <new>
 
 #include "mkvwriter.hpp"
 #include "webmids.hpp"
 
+#ifdef _MSC_VER
+// Disable MSVC warnings that suggest making code non-portable.
+#pragma warning(disable : 4996)
+#endif
+
 namespace mkvmuxer {
 
 namespace {
@@ -34,6 +35,144 @@
 // Date elements are always 8 octets in size.
 const int kDateElementSize = 8;
 
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+                  uint64 timecode_scale) {
+  uint64 block_additional_elem_size = 0;
+  uint64 block_addid_elem_size = 0;
+  uint64 block_more_payload_size = 0;
+  uint64 block_more_elem_size = 0;
+  uint64 block_additions_payload_size = 0;
+  uint64 block_additions_elem_size = 0;
+  if (frame->additional()) {
+    block_additional_elem_size = EbmlElementSize(
+        kMkvBlockAdditional, frame->additional(), frame->additional_length());
+    block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, frame->add_id());
+
+    block_more_payload_size =
+        block_addid_elem_size + block_additional_elem_size;
+    block_more_elem_size =
+        EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
+        block_more_payload_size;
+    block_additions_payload_size = block_more_elem_size;
+    block_additions_elem_size =
+        EbmlMasterElementSize(kMkvBlockAdditions,
+                              block_additions_payload_size) +
+        block_additions_payload_size;
+  }
+
+  uint64 discard_padding_elem_size = 0;
+  if (frame->discard_padding() != 0) {
+    discard_padding_elem_size =
+        EbmlElementSize(kMkvDiscardPadding, frame->discard_padding());
+  }
+
+  const uint64 reference_block_timestamp =
+      frame->reference_block_timestamp() / timecode_scale;
+  uint64 reference_block_elem_size = 0;
+  if (!frame->is_key()) {
+    reference_block_elem_size =
+        EbmlElementSize(kMkvReferenceBlock, reference_block_timestamp);
+  }
+
+  const uint64 duration = frame->duration() / timecode_scale;
+  uint64 block_duration_elem_size = 0;
+  if (duration > 0)
+    block_duration_elem_size = EbmlElementSize(kMkvBlockDuration, duration);
+
+  const uint64 block_payload_size = 4 + frame->length();
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
+
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size + block_duration_elem_size +
+      discard_padding_elem_size + reference_block_elem_size;
+
+  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup,
+                              block_group_payload_size)) {
+    return 0;
+  }
+
+  if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
+    return 0;
+
+  if (WriteUInt(writer, frame->track_number()))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  // For a Block, flags is always 0.
+  if (SerializeInt(writer, 0, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  if (frame->additional()) {
+    if (!WriteEbmlMasterElement(writer, kMkvBlockAdditions,
+                                block_additions_payload_size)) {
+      return 0;
+    }
+
+    if (!WriteEbmlMasterElement(writer, kMkvBlockMore, block_more_payload_size))
+      return 0;
+
+    if (!WriteEbmlElement(writer, kMkvBlockAddID, frame->add_id()))
+      return 0;
+
+    if (!WriteEbmlElement(writer, kMkvBlockAdditional, frame->additional(),
+                          frame->additional_length())) {
+      return 0;
+    }
+  }
+
+  if (frame->discard_padding() != 0 &&
+      !WriteEbmlElement(writer, kMkvDiscardPadding, frame->discard_padding())) {
+    return false;
+  }
+
+  if (!frame->is_key() &&
+      !WriteEbmlElement(writer, kMkvReferenceBlock,
+                        reference_block_timestamp)) {
+    return false;
+  }
+
+  if (duration > 0 && !WriteEbmlElement(writer, kMkvBlockDuration, duration)) {
+    return false;
+  }
+  return EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
+         block_group_payload_size;
+}
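
For reference, the element nesting this consolidated WriteBlock() produces, with
each optional child emitted only when the corresponding Frame field calls for it:

  // BlockGroup
  //   Block
  //     (track number, 16-bit relative timecode, flags byte, frame payload)
  //   BlockAdditions    only when frame->additional() is set
  //     BlockMore
  //       BlockAddID
  //       BlockAdditional
  //   DiscardPadding    only when frame->discard_padding() != 0
  //   ReferenceBlock    only when !frame->is_key()
  //   BlockDuration     only when frame->duration() / timecode_scale > 0
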
+
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                        int64 timecode) {
+  if (WriteID(writer, kMkvSimpleBlock))
+    return 0;
+
+  const int32 size = static_cast<int32>(frame->length()) + 4;
+  if (WriteUInt(writer, size))
+    return 0;
+
+  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  uint64 flags = 0;
+  if (frame->is_key())
+    flags |= 0x80;
+
+  if (SerializeInt(writer, flags, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  return GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
+         frame->length();
+}
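
The SimpleBlock written above carries a fixed 4-byte header ahead of the frame
payload, which is where the "+ 4" in the size computations comes from:

  // SimpleBlock ID
  // EBML-coded size (frame->length() + 4)
  //   byte 0     track number (EBML-coded; one byte for tracks 1..126)
  //   bytes 1-2  signed 16-bit timecode, relative to the cluster timecode
  //   byte 3     flags (0x80 marks a keyframe)
  //   bytes 4+   frame payload
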
+
 }  // namespace
 
 int32 GetCodedUIntSize(uint64 value) {
@@ -72,6 +211,13 @@
   return 8;
 }
 
+int32 GetIntSize(int64 value) {
+  // Doubling the requested value ensures positive values with their high bit
+  // set are written with 0-padding to avoid flipping the signedness.
+  const uint64 v = (value < 0) ? value ^ -1LL : value;
+  return GetUIntSize(2 * v);
+}
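
A few boundary values illustrate the doubling trick, given that GetUIntSize()
returns the minimum number of octets needed to hold an unsigned value:

  // GetIntSize(127)  -> GetUIntSize(254) = 1  (127 fits in one signed octet)
  // GetIntSize(128)  -> GetUIntSize(256) = 2  (0x80 alone would read back as -128)
  // GetIntSize(-128) -> v = 127, GetUIntSize(254) = 1
  // GetIntSize(-129) -> v = 128, GetUIntSize(256) = 2
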
+
 uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
   // Size of EBML ID
   int32 ebml_size = GetUIntSize(type);
@@ -83,7 +229,16 @@
 }
 
 uint64 EbmlElementSize(uint64 type, int64 value) {
-  return EbmlElementSize(type, static_cast<uint64>(value));
+  // Size of EBML ID
+  int32 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += GetIntSize(value);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
 }
 
 uint64 EbmlElementSize(uint64 type, uint64 value) {
@@ -144,7 +299,7 @@
   return ebml_size;
 }
 
-uint64 EbmlDateElementSize(uint64 type, int64 value) {
+uint64 EbmlDateElementSize(uint64 type) {
   // Size of EBML ID
   uint64 ebml_size = GetUIntSize(type);
 
@@ -289,6 +444,23 @@
   return true;
 }
 
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return 0;
+
+  const uint64 size = GetIntSize(value);
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (SerializeInt(writer, value, static_cast<int32>(size)))
+    return false;
+
+  return true;
+}
+
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
   if (!writer)
     return false;
@@ -355,289 +527,25 @@
   return true;
 }
 
-uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
-                        uint64 track_number, int64 timecode, uint64 is_key) {
-  if (!writer)
-    return false;
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster) {
+  if (!writer || !frame || !frame->IsValid() || !cluster ||
+      !cluster->timecode_scale())
+    return 0;
 
-  if (!data || length < 1)
-    return false;
-
-  //  Here we only permit track number values to be no greater than
-  //  126, which the largest value we can store having a Matroska
-  //  integer representation of only 1 byte.
-
-  if (track_number < 1 || track_number > 126)
-    return false;
-
-  //  Technically the timestamp for a block can be less than the
-  //  timestamp for the cluster itself (remember that block timestamp
+  //  Technically the timecode for a block can be less than the
+  //  timecode for the cluster itself (remember that block timecode
   //  is a signed, 16-bit integer).  However, as a simplification we
-  //  only permit non-negative cluster-relative timestamps for blocks.
-
-  if (timecode < 0 || timecode > kMaxBlockTimecode)
-    return false;
-
-  if (WriteID(writer, kMkvSimpleBlock))
+  //  only permit non-negative cluster-relative timecodes for blocks.
+  const int64 relative_timecode = cluster->GetRelativeTimecode(
+      frame->timestamp() / cluster->timecode_scale());
+  if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
     return 0;
 
-  const int32 size = static_cast<int32>(length) + 4;
-  if (WriteUInt(writer, size))
-    return 0;
-
-  if (WriteUInt(writer, static_cast<uint64>(track_number)))
-    return 0;
-
-  if (SerializeInt(writer, timecode, 2))
-    return 0;
-
-  uint64 flags = 0;
-  if (is_key)
-    flags |= 0x80;
-
-  if (SerializeInt(writer, flags, 1))
-    return 0;
-
-  if (writer->Write(data, static_cast<uint32>(length)))
-    return 0;
-
-  const uint64 element_size =
-      GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + length;
-
-  return element_size;
-}
-
-// We must write the metadata (key)frame as a BlockGroup element,
-// because we need to specify a duration for the frame.  The
-// BlockGroup element comprises the frame itself and its duration,
-// and is laid out as follows:
-//
-//   BlockGroup tag
-//   BlockGroup size
-//     Block tag
-//     Block size
-//     (the frame is the block payload)
-//     Duration tag
-//     Duration size
-//     (duration payload)
-//
-uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
-                          uint64 track_number, int64 timecode,
-                          uint64 duration) {
-  // We don't backtrack when writing to the stream, so we must
-  // pre-compute the BlockGroup size, by summing the sizes of each
-  // sub-element (the block and the duration).
-
-  // We use a single byte for the track number of the block, which
-  // means the block header is exactly 4 bytes.
-
-  // TODO(matthewjheaney): use EbmlMasterElementSize and WriteEbmlMasterElement
-
-  const uint64 block_payload_size = 4 + length;
-  const int32 block_size = GetCodedUIntSize(block_payload_size);
-  const uint64 block_elem_size = 1 + block_size + block_payload_size;
-
-  const int32 duration_payload_size = GetUIntSize(duration);
-  const int32 duration_size = GetCodedUIntSize(duration_payload_size);
-  const uint64 duration_elem_size = 1 + duration_size + duration_payload_size;
-
-  const uint64 blockg_payload_size = block_elem_size + duration_elem_size;
-  const int32 blockg_size = GetCodedUIntSize(blockg_payload_size);
-  const uint64 blockg_elem_size = 1 + blockg_size + blockg_payload_size;
-
-  if (WriteID(writer, kMkvBlockGroup))  // 1-byte ID size
-    return 0;
-
-  if (WriteUInt(writer, blockg_payload_size))
-    return 0;
-
-  //  Write Block element
-
-  if (WriteID(writer, kMkvBlock))  // 1-byte ID size
-    return 0;
-
-  if (WriteUInt(writer, block_payload_size))
-    return 0;
-
-  // Byte 1 of 4
-
-  if (WriteUInt(writer, track_number))
-    return 0;
-
-  // Bytes 2 & 3 of 4
-
-  if (SerializeInt(writer, timecode, 2))
-    return 0;
-
-  // Byte 4 of 4
-
-  const uint64 flags = 0;
-
-  if (SerializeInt(writer, flags, 1))
-    return 0;
-
-  // Now write the actual frame (of metadata)
-
-  if (writer->Write(data, static_cast<uint32>(length)))
-    return 0;
-
-  // Write Duration element
-
-  if (WriteID(writer, kMkvBlockDuration))  // 1-byte ID size
-    return 0;
-
-  if (WriteUInt(writer, duration_payload_size))
-    return 0;
-
-  if (SerializeInt(writer, duration, duration_payload_size))
-    return 0;
-
-  // Note that we don't write a reference time as part of the block
-  // group; no reference time(s) indicates that this block is a
-  // keyframe.  (Unlike the case for a SimpleBlock element, the header
-  // bits of the Block sub-element of a BlockGroup element do not
-  // indicate keyframe status.  The keyframe status is inferred from
-  // the absence of reference time sub-elements.)
-
-  return blockg_elem_size;
-}
-
-// Writes a WebM BlockGroup with BlockAdditional data. The structure is as
-// follows:
-// Indentation shows sub-levels
-// BlockGroup
-//  Block
-//    Data
-//  BlockAdditions
-//    BlockMore
-//      BlockAddID
-//        1 (Denotes Alpha)
-//      BlockAdditional
-//        Data
-uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
-                                uint64 length, const uint8* additional,
-                                uint64 additional_length, uint64 add_id,
-                                uint64 track_number, int64 timecode,
-                                uint64 is_key) {
-  if (!data || !additional || length < 1 || additional_length < 1)
-    return 0;
-
-  const uint64 block_payload_size = 4 + length;
-  const uint64 block_elem_size =
-      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
-  const uint64 block_additional_elem_size =
-      EbmlElementSize(kMkvBlockAdditional, additional, additional_length);
-  const uint64 block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, add_id);
-
-  const uint64 block_more_payload_size =
-      block_addid_elem_size + block_additional_elem_size;
-  const uint64 block_more_elem_size =
-      EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
-      block_more_payload_size;
-  const uint64 block_additions_payload_size = block_more_elem_size;
-  const uint64 block_additions_elem_size =
-      EbmlMasterElementSize(kMkvBlockAdditions, block_additions_payload_size) +
-      block_additions_payload_size;
-  const uint64 block_group_payload_size =
-      block_elem_size + block_additions_elem_size;
-  const uint64 block_group_elem_size =
-      EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
-      block_group_payload_size;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
-    return 0;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
-    return 0;
-
-  if (WriteUInt(writer, track_number))
-    return 0;
-
-  if (SerializeInt(writer, timecode, 2))
-    return 0;
-
-  uint64 flags = 0;
-  if (is_key)
-    flags |= 0x80;
-  if (SerializeInt(writer, flags, 1))
-    return 0;
-
-  if (writer->Write(data, static_cast<uint32>(length)))
-    return 0;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlockAdditions,
-                              block_additions_payload_size))
-    return 0;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlockMore, block_more_payload_size))
-    return 0;
-
-  if (!WriteEbmlElement(writer, kMkvBlockAddID, add_id))
-    return 0;
-
-  if (!WriteEbmlElement(writer, kMkvBlockAdditional, additional,
-                        additional_length))
-    return 0;
-
-  return block_group_elem_size;
-}
-
-// Writes a WebM BlockGroup with DiscardPadding. The structure is as follows:
-// Indentation shows sub-levels
-// BlockGroup
-//  Block
-//    Data
-//  DiscardPadding
-uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
-                                    uint64 length, int64 discard_padding,
-                                    uint64 track_number, int64 timecode,
-                                    uint64 is_key) {
-  if (!data || length < 1 || discard_padding <= 0)
-    return 0;
-
-  const uint64 block_payload_size = 4 + length;
-  const uint64 block_elem_size =
-      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
-  const uint64 discard_padding_elem_size =
-      EbmlElementSize(kMkvDiscardPadding, discard_padding);
-  const uint64 block_group_payload_size =
-      block_elem_size + discard_padding_elem_size;
-  const uint64 block_group_elem_size =
-      EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
-      block_group_payload_size;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
-    return 0;
-
-  if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
-    return 0;
-
-  if (WriteUInt(writer, track_number))
-    return 0;
-
-  if (SerializeInt(writer, timecode, 2))
-    return 0;
-
-  uint64 flags = 0;
-  if (is_key)
-    flags |= 0x80;
-  if (SerializeInt(writer, flags, 1))
-    return 0;
-
-  if (writer->Write(data, static_cast<uint32>(length)))
-    return 0;
-
-  if (WriteID(writer, kMkvDiscardPadding))
-    return 0;
-
-  const uint64 size = GetUIntSize(discard_padding);
-  if (WriteUInt(writer, size))
-    return false;
-
-  if (SerializeInt(writer, discard_padding, static_cast<int32>(size)))
-    return false;
-
-  return block_group_elem_size;
+  return frame->CanBeSimpleBlock() ?
+             WriteSimpleBlock(writer, frame, relative_timecode) :
+             WriteBlock(writer, frame, relative_timecode,
+                        cluster->timecode_scale());
 }
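
A worked example of the timecode conversion above, with illustrative values:

  // timecode_scale     = 1000000 ns per tick (the WebM default)
  // frame->timestamp() = 33000000 ns  ->  absolute timecode 33
  // cluster timecode   = 20           ->  relative_timecode = 33 - 20 = 13
  // A negative result, or one above kMaxBlockTimecode, makes WriteFrame()
  // return 0 instead of writing anything.
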
 
 uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
@@ -698,10 +606,7 @@
 // TODO(fgalligan): Move random number generation to platform specific code.
 #ifdef _MSC_VER
     (void)seed;
-    unsigned int random_value;
-    const errno_t e = rand_s(&random_value);
-    (void)e;
-    const int32 nn = random_value;
+    const int32 nn = rand();
 #elif __ANDROID__
     int32 temp_num = 1;
     int fd = open("/dev/urandom", O_RDONLY);
diff --git a/libvpx/third_party/libwebm/mkvmuxerutil.hpp b/libvpx/third_party/libwebm/mkvmuxerutil.hpp
index a092abe..e318576 100644
--- a/libvpx/third_party/libwebm/mkvmuxerutil.hpp
+++ b/libvpx/third_party/libwebm/mkvmuxerutil.hpp
@@ -9,6 +9,7 @@
 #ifndef MKVMUXERUTIL_HPP
 #define MKVMUXERUTIL_HPP
 
+#include "mkvmuxer.hpp"
 #include "mkvmuxertypes.hpp"
 
 namespace mkvmuxer {
@@ -23,6 +24,7 @@
 
 // Returns the size in bytes of the element.
 int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
 int32 GetCodedUIntSize(uint64 value);
 uint64 EbmlMasterElementSize(uint64 type, uint64 value);
 uint64 EbmlElementSize(uint64 type, int64 value);
@@ -30,7 +32,7 @@
 uint64 EbmlElementSize(uint64 type, float value);
 uint64 EbmlElementSize(uint64 type, const char* value);
 uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
-uint64 EbmlDateElementSize(uint64 type, int64 value);
+uint64 EbmlDateElementSize(uint64 type);
 
 // Creates an EBML coded number from |value| and writes it out. The size of
 // the coded number is determined by the value of |value|. |value| must not
@@ -51,73 +53,17 @@
 
 // Output an Mkv non-master element. Returns true if the element was written.
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
                       uint64 size);
 bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
 
-// Output an Mkv Simple Block.
-// Inputs:
-//   data:         Pointer to the data.
-//   length:       Length of the data.
-//   track_number: Track to add the data to. Value returned by Add track
-//                  functions.  Only values in the range [1, 126] are
-//                  permitted.
-//   timecode:     Relative timecode of the Block.  Only values in the
-//                  range [0, 2^15) are permitted.
-//   is_key:       Non-zero value specifies that frame is a key frame.
-uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
-                        uint64 track_number, int64 timecode, uint64 is_key);
-
-// Output a metadata keyframe, using a Block Group element.
-// Inputs:
-//   data:         Pointer to the (meta)data.
-//   length:       Length of the (meta)data.
-//   track_number: Track to add the data to. Value returned by Add track
-//                  functions.  Only values in the range [1, 126] are
-//                  permitted.
-//   timecode      Timecode of frame, relative to cluster timecode.  Only
-//                  values in the range [0, 2^15) are permitted.
-//   duration_timecode  Duration of frame, using timecode units.
-uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
-                          uint64 track_number, int64 timecode,
-                          uint64 duration_timecode);
-
-// Output an Mkv Block with BlockAdditional data.
-// Inputs:
-//   data:         Pointer to the data.
-//   length:       Length of the data.
-//   additional:   Pointer to the additional data
-//   additional_length: Length of the additional data.
-//   add_id: Value of BlockAddID element.
-//   track_number: Track to add the data to. Value returned by Add track
-//                  functions.  Only values in the range [1, 126] are
-//                  permitted.
-//   timecode:     Relative timecode of the Block.  Only values in the
-//                  range [0, 2^15) are permitted.
-//   is_key:       Non-zero value specifies that frame is a key frame.
-uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
-                                uint64 length, const uint8* additional,
-                                uint64 additional_length, uint64 add_id,
-                                uint64 track_number, int64 timecode,
-                                uint64 is_key);
-
-// Output an Mkv Block with a DiscardPadding element.
-// Inputs:
-//   data:            Pointer to the data.
-//   length:          Length of the data.
-//   discard_padding: DiscardPadding value.
-//   track_number:    Track to add the data to. Value returned by Add track
-//                    functions. Only values in the range [1, 126] are
-//                    permitted.
-//   timecode:        Relative timecode of the Block.  Only values in the
-//                    range [0, 2^15) are permitted.
-//   is_key:          Non-zero value specifies that frame is a key frame.
-uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
-                                    uint64 length, int64 discard_padding,
-                                    uint64 track_number, int64 timecode,
-                                    uint64 is_key);
+// Output an Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster);
 
 // Output a void element. |size| must be the entire size in bytes that will be
 // void. The function will calculate the size of the void header and subtract
diff --git a/libvpx/third_party/libwebm/mkvparser.cpp b/libvpx/third_party/libwebm/mkvparser.cpp
index 441f165..fc01be5 100644
--- a/libvpx/third_party/libwebm/mkvparser.cpp
+++ b/libvpx/third_party/libwebm/mkvparser.cpp
@@ -23,7 +23,7 @@
   major = 1;
   minor = 0;
   build = 0;
-  revision = 28;
+  revision = 30;
 }
 
 long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
@@ -130,6 +130,8 @@
   return 0;  // success
 }
 
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
 long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
                                      long long size) {
   assert(pReader);
@@ -217,8 +219,8 @@
   return 0;
 }
 
-long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos, long size,
-                               long long& result) {
+long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos,
+                               long long size, long long& result) {
   assert(pReader);
   assert(pos >= 0);
   assert(size > 0);
@@ -605,6 +607,7 @@
       m_pTracks(NULL),
       m_pCues(NULL),
       m_pChapters(NULL),
+      m_pTags(NULL),
       m_clusters(NULL),
       m_clusterCount(0),
       m_clusterPreloadCount(0),
@@ -629,6 +632,7 @@
   delete m_pInfo;
   delete m_pCues;
   delete m_pChapters;
+  delete m_pTags;
   delete m_pSeekHead;
 }
 
@@ -907,6 +911,19 @@
         if (status)
           return status;
       }
+    } else if (id == 0x0254C367) {  // Tags ID
+      if (m_pTags == NULL) {
+        m_pTags = new (std::nothrow)
+            Tags(this, pos, size, element_start, element_size);
+
+        if (m_pTags == NULL)
+          return -1;
+
+        const long status = m_pTags->Parse();
+
+        if (status)
+          return status;
+      }
     }
 
     m_pos = pos + size;  // consume payload
@@ -1025,23 +1042,11 @@
 
     const long long unknown_size = (1LL << (7 * len)) - 1;
 
-#if 0  // we must handle this to support live webm
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;  //TODO: allow this
-#endif
-
     if ((segment_stop >= 0) && (size != unknown_size) &&
         ((pos + size) > segment_stop)) {
       return E_FILE_FORMAT_INVALID;
     }
 
-#if 0  // commented-out, to support incremental cluster parsing
-        len = static_cast<long>(size);
-
-        if ((pos + size) > avail)
-            return E_BUFFER_NOT_FULL;
-#endif
-
     if (id == 0x0C53BB6B) {  // Cues ID
       if (size == unknown_size)
         return E_FILE_FORMAT_INVALID;  // TODO: liberalize
@@ -1157,10 +1162,8 @@
   }
 
   if (status == 0) {  // no entries found
-    if (cluster_size < 0)
-      return E_FILE_FORMAT_INVALID;  // TODO: handle this
-
-    pos += cluster_size;
+    if (cluster_size >= 0)
+      pos += cluster_size;
 
     if ((total >= 0) && (pos >= total)) {
       m_pos = total;
@@ -1201,306 +1204,15 @@
 
   return 0;  // partial success, since we have a new cluster
 
-// status == 0 means "no block entries found"
-
-// pos designates start of payload
-// m_pos has NOT been adjusted yet (in case we need to come back here)
-
-#if 0
-
-    if (cluster_size < 0) {  //unknown size
-        const long long payload_pos = pos;  //absolute pos of cluster payload
-
-        for (;;) {  //determine cluster size
-            if ((total >= 0) && (pos >= total))
-                break;
-
-            if ((segment_stop >= 0) && (pos >= segment_stop))
-                break;  //no more clusters
-
-            //Read ID
-
-            if ((pos + 1) > avail)
-            {
-                len = 1;
-                return E_BUFFER_NOT_FULL;
-            }
-
-            long long result = GetUIntLength(m_pReader, pos, len);
-
-            if (result < 0)  //error
-                return static_cast<long>(result);
-
-            if (result > 0)  //weird
-                return E_BUFFER_NOT_FULL;
-
-            if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-                return E_FILE_FORMAT_INVALID;
-
-            if ((pos + len) > avail)
-                return E_BUFFER_NOT_FULL;
-
-            const long long idpos = pos;
-            const long long id = ReadUInt(m_pReader, idpos, len);
-
-            if (id < 0)  //error (or underflow)
-                return static_cast<long>(id);
-
-            //This is the distinguished set of ID's we use to determine
-            //that we have exhausted the sub-element's inside the cluster
-            //whose ID we parsed earlier.
-
-            if (id == 0x0F43B675)  //Cluster ID
-                break;
-
-            if (id == 0x0C53BB6B)  //Cues ID
-                break;
-
-            switch (id)
-            {
-                case 0x20:  //BlockGroup
-                case 0x23:  //Simple Block
-                case 0x67:  //TimeCode
-                case 0x2B:  //PrevSize
-                    break;
-
-                default:
-                    assert(false);
-                    break;
-            }
-
-            pos += len;  //consume ID (of sub-element)
-
-            //Read Size
-
-            if ((pos + 1) > avail)
-            {
-                len = 1;
-                return E_BUFFER_NOT_FULL;
-            }
-
-            result = GetUIntLength(m_pReader, pos, len);
-
-            if (result < 0)  //error
-                return static_cast<long>(result);
-
-            if (result > 0)  //weird
-                return E_BUFFER_NOT_FULL;
-
-            if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-                return E_FILE_FORMAT_INVALID;
-
-            if ((pos + len) > avail)
-                return E_BUFFER_NOT_FULL;
-
-            const long long size = ReadUInt(m_pReader, pos, len);
-
-            if (size < 0)  //error
-                return static_cast<long>(size);
-
-            pos += len;  //consume size field of element
-
-            //pos now points to start of sub-element's payload
-
-            if (size == 0)  //weird
-                continue;
-
-            const long long unknown_size = (1LL << (7 * len)) - 1;
-
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;  //not allowed for sub-elements
-
-            if ((segment_stop >= 0) && ((pos + size) > segment_stop))  //weird
-                return E_FILE_FORMAT_INVALID;
-
-            pos += size;  //consume payload of sub-element
-            assert((segment_stop < 0) || (pos <= segment_stop));
-        }  //determine cluster size
-
-        cluster_size = pos - payload_pos;
-        assert(cluster_size >= 0);
-
-        pos = payload_pos;  //reset and re-parse original cluster
-    }
-
-    if (m_clusterPreloadCount > 0)
-    {
-        assert(idx < m_clusterSize);
-
-        Cluster* const pCluster = m_clusters[idx];
-        assert(pCluster);
-        assert(pCluster->m_index < 0);
-
-        const long long off = pCluster->GetPosition();
-        assert(off >= 0);
-
-        if (off == cluster_off)  //preloaded already
-            return E_FILE_FORMAT_INVALID;  //subtle
-    }
-
-    m_pos = pos + cluster_size;  //consume payload
-    assert((segment_stop < 0) || (m_pos <= segment_stop));
-
-    return 2;     //try to find another cluster
-
-#endif
+  // status == 0 means "no block entries found"
+  // pos designates start of payload
+  // m_pos has NOT been adjusted yet (in case we need to come back here)
 }
 
 long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
   assert(m_pos < 0);
   assert(m_pUnknownSize);
 
-#if 0
-    assert(m_pUnknownSize->GetElementSize() < 0);  //TODO: verify this
-
-    const long long element_start = m_pUnknownSize->m_element_start;
-
-    pos = -m_pos;
-    assert(pos > element_start);
-
-    //We have already consumed the (cluster) ID and size fields.
-    //We just need to consume the blocks and other sub-elements
-    //of this cluster, until we discover the boundary.
-
-    long long total, avail;
-
-    long status = m_pReader->Length(&total, &avail);
-
-    if (status < 0)  //error
-        return status;
-
-    assert((total < 0) || (avail <= total));
-
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
-
-    long long element_size = -1;
-
-    for (;;) {  //determine cluster size
-        if ((total >= 0) && (pos >= total))
-        {
-            element_size = total - element_start;
-            assert(element_size > 0);
-
-            break;
-        }
-
-        if ((segment_stop >= 0) && (pos >= segment_stop))
-        {
-            element_size = segment_stop - element_start;
-            assert(element_size > 0);
-
-            break;
-        }
-
-        //Read ID
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long long result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long idpos = pos;
-        const long long id = ReadUInt(m_pReader, idpos, len);
-
-        if (id < 0)  //error (or underflow)
-            return static_cast<long>(id);
-
-        //This is the distinguished set of ID's we use to determine
-        //that we have exhausted the sub-element's inside the cluster
-        //whose ID we parsed earlier.
-
-        if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) {  //Cluster ID or Cues ID
-            element_size = pos - element_start;
-            assert(element_size > 0);
-
-            break;
-        }
-
-#ifdef _DEBUG
-        switch (id)
-        {
-            case 0x20:  //BlockGroup
-            case 0x23:  //Simple Block
-            case 0x67:  //TimeCode
-            case 0x2B:  //PrevSize
-                break;
-
-            default:
-                assert(false);
-                break;
-        }
-#endif
-
-        pos += len;  //consume ID (of sub-element)
-
-        //Read Size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        pos += len;  //consume size field of element
-
-        //pos now points to start of sub-element's payload
-
-        if (size == 0)  //weird
-            continue;
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;  //not allowed for sub-elements
-
-        if ((segment_stop >= 0) && ((pos + size) > segment_stop))  //weird
-            return E_FILE_FORMAT_INVALID;
-
-        pos += size;  //consume payload of sub-element
-        assert((segment_stop < 0) || (pos <= segment_stop));
-    }  //determine cluster size
-
-    assert(element_size >= 0);
-
-    m_pos = element_start + element_size;
-    m_pUnknownSize = 0;
-
-    return 2;  //continue parsing
-#else
   const long status = m_pUnknownSize->Parse(pos, len);
 
   if (status < 0)  // error or underflow
@@ -1522,7 +1234,6 @@
   m_pUnknownSize = 0;
 
   return 2;  // continue parsing
-#endif
 }
 
 void Segment::AppendCluster(Cluster* pCluster) {
@@ -1540,7 +1251,7 @@
   if (count >= size) {
     const long n = (size <= 0) ? 2048 : 2 * size;
 
-    Cluster** const qq = new Cluster* [n];
+    Cluster** const qq = new Cluster*[n];
     Cluster** q = qq;
 
     Cluster** p = m_clusters;
@@ -1594,7 +1305,7 @@
   if (count >= size) {
     const long n = (size <= 0) ? 2048 : 2 * size;
 
-    Cluster** const qq = new Cluster* [n];
+    Cluster** const qq = new Cluster*[n];
     Cluster** q = qq;
 
     Cluster** p = m_clusters;
@@ -1794,55 +1505,6 @@
   return m_void_elements + idx;
 }
 
-#if 0
-void Segment::ParseCues(long long off)
-{
-    if (m_pCues)
-        return;
-
-    //odbgstream os;
-    //os << "Segment::ParseCues (begin)" << endl;
-
-    long long pos = m_start + off;
-    const long long element_start = pos;
-    const long long stop = m_start + m_size;
-
-    long len;
-
-    long long result = GetUIntLength(m_pReader, pos, len);
-    assert(result == 0);
-    assert((pos + len) <= stop);
-
-    const long long idpos = pos;
-
-    const long long id = ReadUInt(m_pReader, idpos, len);
-    assert(id == 0x0C53BB6B);  //Cues ID
-
-    pos += len;  //consume ID
-    assert(pos < stop);
-
-    //Read Size
-
-    result = GetUIntLength(m_pReader, pos, len);
-    assert(result == 0);
-    assert((pos + len) <= stop);
-
-    const long long size = ReadUInt(m_pReader, pos, len);
-    assert(size >= 0);
-
-    pos += len;  //consume length of size of element
-    assert((pos + size) <= stop);
-
-    const long long element_size = size + pos - element_start;
-
-    //Pos now points to start of payload
-
-    m_pCues = new Cues(this, pos, size, element_start, element_size);
-    assert(m_pCues);  //TODO
-
-    //os << "Segment::ParseCues (end)" << endl;
-}
-#else
 long Segment::ParseCues(long long off, long long& pos, long& len) {
   if (m_pCues)
     return 0;  // success
@@ -1957,67 +1619,7 @@
 
   return 0;  // success
 }
-#endif
 
-#if 0
-void Segment::ParseSeekEntry(
-    long long start,
-    long long size_)
-{
-    long long pos = start;
-
-    const long long stop = start + size_;
-
-    long len;
-
-    const long long seekIdId = ReadUInt(m_pReader, pos, len);
-    //seekIdId;
-    assert(seekIdId == 0x13AB);  //SeekID ID
-    assert((pos + len) <= stop);
-
-    pos += len;  //consume id
-
-    const long long seekIdSize = ReadUInt(m_pReader, pos, len);
-    assert(seekIdSize >= 0);
-    assert((pos + len) <= stop);
-
-    pos += len;  //consume size
-
-    const long long seekId = ReadUInt(m_pReader, pos, len);  //payload
-    assert(seekId >= 0);
-    assert(len == seekIdSize);
-    assert((pos + len) <= stop);
-
-    pos += seekIdSize;  //consume payload
-
-    const long long seekPosId = ReadUInt(m_pReader, pos, len);
-    //seekPosId;
-    assert(seekPosId == 0x13AC);  //SeekPos ID
-    assert((pos + len) <= stop);
-
-    pos += len;  //consume id
-
-    const long long seekPosSize = ReadUInt(m_pReader, pos, len);
-    assert(seekPosSize >= 0);
-    assert((pos + len) <= stop);
-
-    pos += len;  //consume size
-    assert((pos + seekPosSize) <= stop);
-
-    const long long seekOff = UnserializeUInt(m_pReader, pos, seekPosSize);
-    assert(seekOff >= 0);
-    assert(seekOff < m_size);
-
-    pos += seekPosSize;  //consume payload
-    assert(pos == stop);
-
-    const long long seekPos = m_start + seekOff;
-    assert(seekPos < (m_start + m_size));
-
-    if (seekId == 0x0C53BB6B)  //Cues ID
-        ParseCues(seekOff);
-}
-#else
 bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
                           Entry* pEntry) {
   if (size_ <= 0)
@@ -2110,7 +1712,6 @@
 
   return true;
 }
-#endif
 
 Cues::Cues(Segment* pSegment, long long start_, long long size_,
            long long element_start, long long element_size)
@@ -2152,9 +1753,9 @@
   return (m_pos >= stop);
 }
 
-void Cues::Init() const {
+bool Cues::Init() const {
   if (m_cue_points)
-    return;
+    return true;
 
   assert(m_count == 0);
   assert(m_preload_count == 0);
@@ -2172,24 +1773,28 @@
     long len;
 
     const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);  // TODO
-    assert((pos + len) <= stop);
+    if (id < 0 || (pos + len) > stop) {
+      return false;
+    }
 
     pos += len;  // consume ID
 
     const long long size = ReadUInt(pReader, pos, len);
-    assert(size >= 0);
-    assert((pos + len) <= stop);
+    if (size < 0 || (pos + len > stop)) {
+      return false;
+    }
 
     pos += len;  // consume Size field
-    assert((pos + size) <= stop);
+    if (pos + size > stop) {
+      return false;
+    }
 
     if (id == 0x3B)  // CuePoint ID
       PreloadCuePoint(cue_points_size, idpos);
 
-    pos += size;  // consume payload
-    assert(pos <= stop);
+    pos += size;  // skip payload
   }
+  return true;
 }
 
 void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
@@ -2198,7 +1803,7 @@
   if (m_preload_count >= cue_points_size) {
     const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
 
-    CuePoint** const qq = new CuePoint* [n];
+    CuePoint** const qq = new CuePoint*[n];
     CuePoint** q = qq;  // beginning of target
 
     CuePoint** p = m_cue_points;  // beginning of source
@@ -2226,7 +1831,10 @@
   if (m_pos >= stop)
     return false;  // nothing else to do
 
-  Init();
+  if (!Init()) {
+    m_pos = stop;
+    return false;
+  }
 
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
@@ -2263,7 +1871,10 @@
     if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
       return false;
 
-    pCP->Load(pReader);
+    if (!pCP->Load(pReader)) {
+      m_pos = stop;
+      return false;
+    }
     ++m_count;
     --m_preload_count;
 
@@ -2282,62 +1893,6 @@
   assert(time_ns >= 0);
   assert(pTrack);
 
-#if 0
-    LoadCuePoint();  //establish invariant
-
-    assert(m_cue_points);
-    assert(m_count > 0);
-
-    CuePoint** const ii = m_cue_points;
-    CuePoint** i = ii;
-
-    CuePoint** const jj = ii + m_count + m_preload_count;
-    CuePoint** j = jj;
-
-    pCP = *i;
-    assert(pCP);
-
-    if (time_ns <= pCP->GetTime(m_pSegment))
-    {
-        pTP = pCP->Find(pTrack);
-        return (pTP != NULL);
-    }
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    while (i < j)
-    {
-        //INVARIANT:
-        //[ii, i) <= time_ns
-        //[i, j)  ?
-        //[j, jj) > time_ns
-
-        CuePoint** const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        CuePoint* const pCP = *k;
-        assert(pCP);
-
-        pCP->Load(pReader);
-
-        const long long t = pCP->GetTime(m_pSegment);
-
-        if (t <= time_ns)
-            i = k + 1;
-        else
-            j = k;
-
-        assert(i <= j);
-    }
-
-    assert(i == j);
-    assert(i <= jj);
-    assert(i > ii);
-
-    pCP = *--i;
-    assert(pCP);
-    assert(pCP->GetTime(m_pSegment) <= time_ns);
-#else
   if (m_cue_points == NULL)
     return false;
 
@@ -2387,7 +1942,6 @@
   pCP = *--i;
   assert(pCP);
   assert(pCP->GetTime(m_pSegment) <= time_ns);
-#endif
 
   // TODO: here and elsewhere, it's probably not correct to search
   // for the cue point with this time, and then search for a matching
@@ -2401,65 +1955,6 @@
   return (pTP != NULL);
 }
 
-#if 0
-bool Cues::FindNext(
-    long long time_ns,
-    const Track* pTrack,
-    const CuePoint*& pCP,
-    const CuePoint::TrackPosition*& pTP) const
-{
-    pCP = 0;
-    pTP = 0;
-
-    if (m_count == 0)
-        return false;
-
-    assert(m_cue_points);
-
-    const CuePoint* const* const ii = m_cue_points;
-    const CuePoint* const* i = ii;
-
-    const CuePoint* const* const jj = ii + m_count;
-    const CuePoint* const* j = jj;
-
-    while (i < j)
-    {
-        //INVARIANT:
-        //[ii, i) <= time_ns
-        //[i, j)  ?
-        //[j, jj) > time_ns
-
-        const CuePoint* const* const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        pCP = *k;
-        assert(pCP);
-
-        const long long t = pCP->GetTime(m_pSegment);
-
-        if (t <= time_ns)
-            i = k + 1;
-        else
-            j = k;
-
-        assert(i <= j);
-    }
-
-    assert(i == j);
-    assert(i <= jj);
-
-    if (i >= jj)  //time_ns is greater than max cue point
-        return false;
-
-    pCP = *i;
-    assert(pCP);
-    assert(pCP->GetTime(m_pSegment) > time_ns);
-
-    pTP = pCP->Find(pTrack);
-    return (pTP != NULL);
-}
-#endif
-
 const CuePoint* Cues::GetFirst() const {
   if (m_cue_points == NULL)
     return NULL;
@@ -2467,15 +1962,6 @@
   if (m_count == 0)
     return NULL;
 
-#if 0
-    LoadCuePoint();  //init cues
-
-    const size_t count = m_count + m_preload_count;
-
-    if (count == 0)  //weird
-        return NULL;
-#endif
-
   CuePoint* const* const pp = m_cue_points;
   assert(pp);
 
@@ -2493,25 +1979,6 @@
   if (m_count <= 0)
     return NULL;
 
-#if 0
-    LoadCuePoint();  //init cues
-
-    const size_t count = m_count + m_preload_count;
-
-    if (count == 0)  //weird
-        return NULL;
-
-    const size_t index = count - 1;
-
-    CuePoint* const* const pp = m_cue_points;
-    assert(pp);
-
-    CuePoint* const pCP = pp[index];
-    assert(pCP);
-
-    pCP->Load(m_pSegment->m_pReader);
-    assert(pCP->GetTimeCode() >= 0);
-#else
   const long index = m_count - 1;
 
   CuePoint* const* const pp = m_cue_points;
@@ -2520,7 +1987,6 @@
   CuePoint* const pCP = pp[index];
   assert(pCP);
   assert(pCP->GetTimeCode() >= 0);
-#endif
 
   return pCP;
 }
@@ -2533,26 +1999,6 @@
   assert(m_cue_points);
   assert(m_count >= 1);
 
-#if 0
-    const size_t count = m_count + m_preload_count;
-
-    size_t index = pCurr->m_index;
-    assert(index < count);
-
-    CuePoint* const* const pp = m_cue_points;
-    assert(pp);
-    assert(pp[index] == pCurr);
-
-    ++index;
-
-    if (index >= count)
-        return NULL;
-
-    CuePoint* const pNext = pp[index];
-    assert(pNext);
-
-    pNext->Load(m_pSegment->m_pReader);
-#else
   long index = pCurr->m_index;
   assert(index < m_count);
 
@@ -2568,7 +2014,6 @@
   CuePoint* const pNext = pp[index];
   assert(pNext);
   assert(pNext->GetTimeCode() >= 0);
-#endif
 
   return pNext;
 }
@@ -2705,12 +2150,12 @@
 
 CuePoint::~CuePoint() { delete[] m_track_positions; }
 
-void CuePoint::Load(IMkvReader* pReader) {
+bool CuePoint::Load(IMkvReader* pReader) {
   // odbgstream os;
   // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl;
 
   if (m_timecode >= 0)  // already loaded
-    return;
+    return true;
 
   assert(m_track_positions == NULL);
   assert(m_track_positions_count == 0);
@@ -2726,7 +2171,7 @@
     const long long id = ReadUInt(pReader, pos_, len);
     assert(id == 0x3B);  // CuePoint ID
     if (id != 0x3B)
-      return;
+      return false;
 
     pos_ += len;  // consume ID
 
@@ -2749,17 +2194,21 @@
     long len;
 
     const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);  // TODO
-    assert((pos + len) <= stop);
+    if ((id < 0) || (pos + len > stop)) {
+      return false;
+    }
 
     pos += len;  // consume ID
 
     const long long size = ReadUInt(pReader, pos, len);
-    assert(size >= 0);
-    assert((pos + len) <= stop);
+    if ((size < 0) || (pos + len > stop)) {
+      return false;
+    }
 
     pos += len;  // consume Size field
-    assert((pos + size) <= stop);
+    if ((pos + size) > stop) {
+      return false;
+    }
 
     if (id == 0x33)  // CueTime ID
       m_timecode = UnserializeUInt(pReader, pos, size);
@@ -2768,11 +2217,11 @@
       ++m_track_positions_count;
 
     pos += size;  // consume payload
-    assert(pos <= stop);
   }
 
-  assert(m_timecode >= 0);
-  assert(m_track_positions_count > 0);
+  if (m_timecode < 0 || m_track_positions_count <= 0) {
+    return false;
+  }
 
   // os << "CuePoint::Load(cont'd): idpos=" << idpos
   //   << " timecode=" << m_timecode
@@ -2789,7 +2238,7 @@
     long len;
 
     const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);  // TODO
+    assert(id >= 0);
     assert((pos + len) <= stop);
 
     pos += len;  // consume ID
@@ -2803,7 +2252,9 @@
 
     if (id == 0x37) {  // CueTrackPosition(s) ID
       TrackPosition& tp = *p++;
-      tp.Parse(pReader, pos, size);
+      if (!tp.Parse(pReader, pos, size)) {
+        return false;
+      }
     }
 
     pos += size;  // consume payload
@@ -2814,9 +2265,11 @@
 
   m_element_start = element_start;
   m_element_size = element_size;
+
+  return true;
 }
 
-void CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
+bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
                                     long long size_) {
   const long long stop = start_ + size_;
   long long pos = start_;
@@ -2829,17 +2282,21 @@
     long len;
 
     const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);  // TODO
-    assert((pos + len) <= stop);
+    if ((id < 0) || ((pos + len) > stop)) {
+      return false;
+    }
 
     pos += len;  // consume ID
 
     const long long size = ReadUInt(pReader, pos, len);
-    assert(size >= 0);
-    assert((pos + len) <= stop);
+    if ((size < 0) || ((pos + len) > stop)) {
+      return false;
+    }
 
     pos += len;  // consume Size field
-    assert((pos + size) <= stop);
+    if ((pos + size) > stop) {
+      return false;
+    }
 
     if (id == 0x77)  // CueTrack ID
       m_track = UnserializeUInt(pReader, pos, size);
@@ -2851,12 +2308,13 @@
       m_block = UnserializeUInt(pReader, pos, size);
 
     pos += size;  // consume payload
-    assert(pos <= stop);
   }
 
-  assert(m_pos >= 0);
-  assert(m_track > 0);
-  // assert(m_block > 0);
+  if ((m_pos < 0) || (m_track <= 0)) {
+    return false;
+  }
+
+  return true;
 }
 
 const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
@@ -2894,20 +2352,6 @@
   return time;
 }
 
-#if 0
-long long Segment::Unparsed() const
-{
-    if (m_size < 0)
-        return LLONG_MAX;
-
-    const long long stop = m_start + m_size;
-
-    const long long result = stop - m_pos;
-    assert(result >= 0);
-
-    return result;
-}
-#else
 bool Segment::DoneParsing() const {
   if (m_size < 0) {
     long long total, avail;
@@ -2927,7 +2371,6 @@
 
   return (m_pos >= stop);
 }
-#endif
 
 const Cluster* Segment::GetFirst() const {
   if ((m_clusters == NULL) || (m_clusterCount <= 0))
@@ -3395,15 +2838,7 @@
       continue;
     }
 
-#if 0  // this is commented-out to support incremental cluster parsing
-        len = static_cast<long>(size);
-
-        if (element_stop > avail)
-            return E_BUFFER_NOT_FULL;
-#endif
-
     // We have a cluster.
-
     off_next = idoff;
 
     if (size != unknown_size)
@@ -3647,188 +3082,11 @@
   return pCluster;
 }
 
-#if 0
-const BlockEntry* Segment::Seek(
-    long long time_ns,
-    const Track* pTrack) const
-{
-    assert(pTrack);
-
-    if ((m_clusters == NULL) || (m_clusterCount <= 0))
-        return pTrack->GetEOS();
-
-    Cluster** const i = m_clusters;
-    assert(i);
-
-    {
-        Cluster* const pCluster = *i;
-        assert(pCluster);
-        assert(pCluster->m_index == 0);  //m_clusterCount > 0
-        assert(pCluster->m_pSegment == this);
-
-        if (time_ns <= pCluster->GetTime())
-            return pCluster->GetEntry(pTrack);
-    }
-
-    Cluster** const j = i + m_clusterCount;
-
-    if (pTrack->GetType() == 2) {  //audio
-        //TODO: we could decide to use cues for this, as we do for video.
-        //But we only use it for video because looking around for a keyframe
-        //can get expensive.  Audio doesn't require anything special so a
-        //straight cluster search is good enough (we assume).
-
-        Cluster** lo = i;
-        Cluster** hi = j;
-
-        while (lo < hi)
-        {
-            //INVARIANT:
-            //[i, lo) <= time_ns
-            //[lo, hi) ?
-            //[hi, j)  > time_ns
-
-            Cluster** const mid = lo + (hi - lo) / 2;
-            assert(mid < hi);
-
-            Cluster* const pCluster = *mid;
-            assert(pCluster);
-            assert(pCluster->m_index == long(mid - m_clusters));
-            assert(pCluster->m_pSegment == this);
-
-            const long long t = pCluster->GetTime();
-
-            if (t <= time_ns)
-                lo = mid + 1;
-            else
-                hi = mid;
-
-            assert(lo <= hi);
-        }
-
-        assert(lo == hi);
-        assert(lo > i);
-        assert(lo <= j);
-
-        while (lo > i)
-        {
-            Cluster* const pCluster = *--lo;
-            assert(pCluster);
-            assert(pCluster->GetTime() <= time_ns);
-
-            const BlockEntry* const pBE = pCluster->GetEntry(pTrack);
-
-            if ((pBE != 0) && !pBE->EOS())
-                return pBE;
-
-            //landed on empty cluster (no entries)
-        }
-
-        return pTrack->GetEOS();  //weird
-    }
-
-    assert(pTrack->GetType() == 1);  //video
-
-    Cluster** lo = i;
-    Cluster** hi = j;
-
-    while (lo < hi)
-    {
-        //INVARIANT:
-        //[i, lo) <= time_ns
-        //[lo, hi) ?
-        //[hi, j)  > time_ns
-
-        Cluster** const mid = lo + (hi - lo) / 2;
-        assert(mid < hi);
-
-        Cluster* const pCluster = *mid;
-        assert(pCluster);
-
-        const long long t = pCluster->GetTime();
-
-        if (t <= time_ns)
-            lo = mid + 1;
-        else
-            hi = mid;
-
-        assert(lo <= hi);
-    }
-
-    assert(lo == hi);
-    assert(lo > i);
-    assert(lo <= j);
-
-    Cluster* pCluster = *--lo;
-    assert(pCluster);
-    assert(pCluster->GetTime() <= time_ns);
-
-    {
-        const BlockEntry* const pBE = pCluster->GetEntry(pTrack, time_ns);
-
-        if ((pBE != 0) && !pBE->EOS())  //found a keyframe
-            return pBE;
-    }
-
-    const VideoTrack* const pVideo = static_cast<const VideoTrack*>(pTrack);
-
-    while (lo != i)
-    {
-        pCluster = *--lo;
-        assert(pCluster);
-        assert(pCluster->GetTime() <= time_ns);
-
-        const BlockEntry* const pBlockEntry = pCluster->GetMaxKey(pVideo);
-
-        if ((pBlockEntry != 0) && !pBlockEntry->EOS())
-            return pBlockEntry;
-    }
-
-    //weird: we're on the first cluster, but no keyframe found
-    //should never happen but we must return something anyway
-
-    return pTrack->GetEOS();
-}
-#endif
-
-#if 0
-bool Segment::SearchCues(
-    long long time_ns,
-    Track* pTrack,
-    Cluster*& pCluster,
-    const BlockEntry*& pBlockEntry,
-    const CuePoint*& pCP,
-    const CuePoint::TrackPosition*& pTP)
-{
-    if (pTrack->GetType() != 1)  //not video
-        return false;  //TODO: for now, just handle video stream
-
-    if (m_pCues == NULL)
-        return false;
-
-    if (!m_pCues->Find(time_ns, pTrack, pCP, pTP))
-        return false;  //weird
-
-    assert(pCP);
-    assert(pTP);
-    assert(pTP->m_track == pTrack->GetNumber());
-
-    //We have the cue point and track position we want,
-    //so we now need to search for the cluster having
-    //the indicated position.
-
-    return GetCluster(pCP, pTP, pCluster, pBlockEntry);
-}
-#endif
-
 const Tracks* Segment::GetTracks() const { return m_pTracks; }
-
 const SegmentInfo* Segment::GetInfo() const { return m_pInfo; }
-
 const Cues* Segment::GetCues() const { return m_pCues; }
-
 const Chapters* Segment::GetChapters() const { return m_pChapters; }
-
+const Tags* Segment::GetTags() const { return m_pTags; }
 const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; }
 
 long long Segment::GetDuration() const {
@@ -3853,6 +3111,7 @@
     Edition& e = m_editions[--m_editions_count];
     e.Clear();
   }
+  delete[] m_editions;
 }
 
 long Chapters::Parse() {
@@ -4128,12 +3387,13 @@
       if (status < 0)  // error
         return status;
     } else if (id == 0x33C4) {  // UID ID
-      const long long val = UnserializeUInt(pReader, pos, size);
+      long long val;
+      status = UnserializeInt(pReader, pos, size, val);
 
-      if (val < 0)  // error
-        return static_cast<long>(val);
+      if (status < 0)  // error
+        return status;
 
-      m_uid = val;
+      m_uid = static_cast<unsigned long long>(val);
     } else if (id == 0x11) {  // TimeStart ID
       const long long val = UnserializeUInt(pReader, pos, size);
 
@@ -4292,6 +3552,277 @@
   return 0;
 }
 
+Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size,
+           long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),
+      m_size(payload_size),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_tags(NULL),
+      m_tags_size(0),
+      m_tags_count(0) {}
+
+Tags::~Tags() {
+  while (m_tags_count > 0) {
+    Tag& t = m_tags[--m_tags_count];
+    t.Clear();
+  }
+  delete[] m_tags;
+}
+
+long Tags::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = m_start;  // payload start
+  const long long stop = pos + m_size;  // payload stop
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)
+      return status;
+
+    if (size == 0)  // 0 length tag, read another
+      continue;
+
+    if (id == 0x3373) {  // Tag ID
+      status = ParseTag(pos, size);
+
+      if (status < 0)
+        return status;
+    }
+
+    pos += size;
+    assert(pos <= stop);
+    if (pos > stop)
+      return -1;
+  }
+
+  assert(pos == stop);
+  if (pos != stop)
+    return -1;
+
+  return 0;
+}
+
+int Tags::GetTagCount() const { return m_tags_count; }
+
+const Tags::Tag* Tags::GetTag(int idx) const {
+  if (idx < 0)
+    return NULL;
+
+  if (idx >= m_tags_count)
+    return NULL;
+
+  return m_tags + idx;
+}
+
+bool Tags::ExpandTagsArray() {
+  if (m_tags_size > m_tags_count)
+    return true;  // nothing else to do
+
+  const int size = (m_tags_size == 0) ? 1 : 2 * m_tags_size;
+
+  Tag* const tags = new (std::nothrow) Tag[size];
+
+  if (tags == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_tags_count; ++idx) {
+    m_tags[idx].ShallowCopy(tags[idx]);
+  }
+
+  delete[] m_tags;
+  m_tags = tags;
+
+  m_tags_size = size;
+  return true;
+}
+
+long Tags::ParseTag(long long pos, long long size) {
+  if (!ExpandTagsArray())
+    return -1;
+
+  Tag& t = m_tags[m_tags_count++];
+  t.Init();
+
+  return t.Parse(m_pSegment->m_pReader, pos, size);
+}
+
+Tags::Tag::Tag() {}
+
+Tags::Tag::~Tag() {}
+
+int Tags::Tag::GetSimpleTagCount() const { return m_simple_tags_count; }
+
+const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const {
+  if (index < 0)
+    return NULL;
+
+  if (index >= m_simple_tags_count)
+    return NULL;
+
+  return m_simple_tags + index;
+}
+
+void Tags::Tag::Init() {
+  m_simple_tags = NULL;
+  m_simple_tags_size = 0;
+  m_simple_tags_count = 0;
+}
+
+void Tags::Tag::ShallowCopy(Tag& rhs) const {
+  rhs.m_simple_tags = m_simple_tags;
+  rhs.m_simple_tags_size = m_simple_tags_size;
+  rhs.m_simple_tags_count = m_simple_tags_count;
+}
+
+void Tags::Tag::Clear() {
+  while (m_simple_tags_count > 0) {
+    SimpleTag& d = m_simple_tags[--m_simple_tags_count];
+    d.Clear();
+  }
+
+  delete[] m_simple_tags;
+  m_simple_tags = NULL;
+
+  m_simple_tags_size = 0;
+}
+
+long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)
+      return status;
+
+    if (size == 0)  // 0 length tag, read another
+      continue;
+
+    if (id == 0x27C8) {  // SimpleTag ID
+      status = ParseSimpleTag(pReader, pos, size);
+
+      if (status < 0)
+        return status;
+    }
+
+    pos += size;
+    assert(pos <= stop);
+    if (pos > stop)
+      return -1;
+  }
+
+  assert(pos == stop);
+  if (pos != stop)
+    return -1;
+  return 0;
+}
+
+long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos,
+                               long long size) {
+  if (!ExpandSimpleTagsArray())
+    return -1;
+
+  SimpleTag& st = m_simple_tags[m_simple_tags_count++];
+  st.Init();
+
+  return st.Parse(pReader, pos, size);
+}
+
+bool Tags::Tag::ExpandSimpleTagsArray() {
+  if (m_simple_tags_size > m_simple_tags_count)
+    return true;  // nothing else to do
+
+  const int size = (m_simple_tags_size == 0) ? 1 : 2 * m_simple_tags_size;
+
+  SimpleTag* const displays = new (std::nothrow) SimpleTag[size];
+
+  if (displays == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_simple_tags_count; ++idx) {
+    m_simple_tags[idx].ShallowCopy(displays[idx]);
+  }
+
+  delete[] m_simple_tags;
+  m_simple_tags = displays;
+
+  m_simple_tags_size = size;
+  return true;
+}
+
+Tags::SimpleTag::SimpleTag() {}
+
+Tags::SimpleTag::~SimpleTag() {}
+
+const char* Tags::SimpleTag::GetTagName() const { return m_tag_name; }
+
+const char* Tags::SimpleTag::GetTagString() const { return m_tag_string; }
+
+void Tags::SimpleTag::Init() {
+  m_tag_name = NULL;
+  m_tag_string = NULL;
+}
+
+void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const {
+  rhs.m_tag_name = m_tag_name;
+  rhs.m_tag_string = m_tag_string;
+}
+
+void Tags::SimpleTag::Clear() {
+  delete[] m_tag_name;
+  m_tag_name = NULL;
+
+  delete[] m_tag_string;
+  m_tag_string = NULL;
+}
+
+long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
+                            long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == 0x5A3) {  // TagName ID
+      status = UnserializeString(pReader, pos, size, m_tag_name);
+
+      if (status)
+        return status;
+    } else if (id == 0x487) {  // TagString ID
+      status = UnserializeString(pReader, pos, size, m_tag_string);
+
+      if (status)
+        return status;
+    }
+
+    pos += size;
+    assert(pos <= stop);
+    if (pos > stop)
+      return -1;
+  }
+
+  assert(pos == stop);
+  if (pos != stop)
+    return -1;
+  return 0;
+}
+
 SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_,
                          long long element_start, long long element_size)
     : m_pSegment(pSegment),
@@ -4458,7 +3989,7 @@
 }
 
 const ContentEncoding::ContentCompression*
-ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+    ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
   const ptrdiff_t count = compression_entries_end_ - compression_entries_;
   assert(count >= 0);
 
@@ -4554,7 +4085,7 @@
 
   if (compression_count > 0) {
     compression_entries_ =
-        new (std::nothrow) ContentCompression* [compression_count];
+        new (std::nothrow) ContentCompression*[compression_count];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
@@ -4562,7 +4093,7 @@
 
   if (encryption_count > 0) {
     encryption_entries_ =
-        new (std::nothrow) ContentEncryption* [encryption_count];
+        new (std::nothrow) ContentEncryption*[encryption_count];
     if (!encryption_entries_) {
       delete[] compression_entries_;
       return -1;
@@ -4703,7 +4234,7 @@
         return E_FILE_FORMAT_INVALID;
     } else if (id == 0x7E2) {
       // ContentEncKeyID
-      delete[] encryption -> key_id;
+      delete[] encryption->key_id;
       encryption->key_id = NULL;
       encryption->key_id_len = 0;
 
@@ -4727,7 +4258,7 @@
       encryption->key_id_len = buflen;
     } else if (id == 0x7E3) {
       // ContentSignature
-      delete[] encryption -> signature;
+      delete[] encryption->signature;
       encryption->signature = NULL;
       encryption->signature_len = 0;
 
@@ -4751,7 +4282,7 @@
       encryption->signature_len = buflen;
     } else if (id == 0x7E4) {
       // ContentSigKeyID
-      delete[] encryption -> sig_key_id;
+      delete[] encryption->sig_key_id;
       encryption->sig_key_id = NULL;
       encryption->sig_key_id_len = 0;
 
@@ -4991,17 +4522,10 @@
     }
 
     if (pCluster->EOS()) {
-#if 0
-            if (m_pSegment->Unparsed() <= 0) {  //all clusters have been loaded
-                pBlockEntry = GetEOS();
-                return 1;
-            }
-#else
       if (m_pSegment->DoneParsing()) {
         pBlockEntry = GetEOS();
         return 1;
       }
-#endif
 
       pBlockEntry = 0;
       return E_BUFFER_NOT_FULL;
@@ -5098,18 +4622,10 @@
     }
 
     if (pCluster->EOS()) {
-#if 0
-            if (m_pSegment->Unparsed() <= 0)   //all clusters have been loaded
-            {
-                pNextEntry = GetEOS();
-                return 1;
-            }
-#else
       if (m_pSegment->DoneParsing()) {
         pNextEntry = GetEOS();
         return 1;
       }
-#endif
 
       // TODO: there is a potential O(n^2) problem here: we tell the
       // caller to (pre)load another cluster, which he does, but then he
@@ -5291,7 +4807,7 @@
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding* [count];
+  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
   if (!content_encoding_entries_)
     return -1;
 
@@ -5350,6 +4866,11 @@
 
   long long width = 0;
   long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+
   double rate = 0.0;
 
   IMkvReader* const pReader = pSegment->m_pReader;
@@ -5381,6 +4902,26 @@
 
       if (height <= 0)
         return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x14B0) {  // display width
+      display_width = UnserializeUInt(pReader, pos, size);
+
+      if (display_width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x14BA) {  // display height
+      display_height = UnserializeUInt(pReader, pos, size);
+
+      if (display_height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x14B2) {  // display unit
+      display_unit = UnserializeUInt(pReader, pos, size);
+
+      if (display_unit < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x13B8) {  // stereo mode
+      stereo_mode = UnserializeUInt(pReader, pos, size);
+
+      if (stereo_mode < 0)
+        return E_FILE_FORMAT_INVALID;
     } else if (id == 0x0383E3) {  // frame rate
       const long status = UnserializeFloat(pReader, pos, size, rate);
 
@@ -5412,6 +4953,10 @@
 
   pTrack->m_width = width;
   pTrack->m_height = height;
+  pTrack->m_display_width = display_width;
+  pTrack->m_display_height = display_height;
+  pTrack->m_display_unit = display_unit;
+  pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
 
   pResult = pTrack;
@@ -5498,16 +5043,7 @@
     assert(pCluster);
     assert(pCluster->GetTime() <= time_ns);
 
-#if 0
-        //TODO:
-        //We need to handle the case when a cluster
-        //contains multiple keyframes.  Simply returning
-        //the largest keyframe on the cluster isn't
-        //good enough.
-        pResult = pCluster->GetMaxKey(this);
-#else
     pResult = pCluster->GetEntry(this, time_ns);
-#endif
 
     if ((pResult != 0) && !pResult->EOS())
       return 0;
@@ -5524,6 +5060,18 @@
 
 long long VideoTrack::GetHeight() const { return m_height; }
 
+long long VideoTrack::GetDisplayWidth() const {
+  return m_display_width > 0 ? m_display_width : GetWidth();
+}
+
+long long VideoTrack::GetDisplayHeight() const {
+  return m_display_height > 0 ? m_display_height : GetHeight();
+}
+
+long long VideoTrack::GetDisplayUnit() const { return m_display_unit; }
+
+long long VideoTrack::GetStereoMode() const { return m_stereo_mode; }
+
 double VideoTrack::GetFrameRate() const { return m_rate; }
 
 AudioTrack::AudioTrack(Segment* pSegment, long long element_start,
@@ -5658,7 +5206,7 @@
   if (count <= 0)
     return 0;  // success
 
-  m_trackEntries = new (std::nothrow) Track* [count];
+  m_trackEntries = new (std::nothrow) Track*[count];
 
   if (m_trackEntries == NULL)
     return -1;
@@ -5941,7 +5489,7 @@
     if (v.start >= 0)
       return E_FILE_FORMAT_INVALID;
 
-    if (e.start >= 0)
+    if (info.type == Track::kMetadata && e.start >= 0)
       return E_FILE_FORMAT_INVALID;
 
     info.settings.start = -1;
@@ -6003,25 +5551,6 @@
   return m_trackEntries[idx];
 }
 
-#if 0
-long long Cluster::Unparsed() const
-{
-    if (m_timecode < 0)  //not even partially loaded
-        return LLONG_MAX;
-
-    assert(m_pos >= m_element_start);
-    //assert(m_element_size > m_size);
-
-    const long long element_stop = m_element_start + m_element_size;
-    assert(m_pos <= element_stop);
-
-    const long long result = element_stop - m_pos;
-    assert(result >= 0);
-
-    return result;
-}
-#endif
-
 long Cluster::Load(long long& pos, long& len) const {
   assert(m_pSegment);
   assert(m_pos >= m_element_start);
@@ -6115,15 +5644,7 @@
       cluster_size = size;
   }
 
-// pos points to start of payload
-
-#if 0
-    len = static_cast<long>(size_);
-
-    if (cluster_stop > avail)
-        return E_BUFFER_NOT_FULL;
-#endif
-
+  // pos points to start of payload
   long long timecode = -1;
   long long new_pos = -1;
   bool bBlock = false;
@@ -6495,36 +6016,6 @@
   if (track == 0)
     return E_FILE_FORMAT_INVALID;
 
-#if 0
-    //TODO(matthewjheaney)
-    //This turned out to be too conservative.  The problem is that
-    //if we see a track header in the tracks element with an unsupported
-    //track type, we throw that track header away, so it is not present
-    //in the track map.  But even though we don't understand the track
-    //header, there are still blocks in the cluster with that track
-    //number.  It was our decision to ignore that track header, so it's
-    //up to us to deal with blocks associated with that track -- we
-    //cannot simply report an error since technically there's nothing
-    //wrong with the file.
-    //
-    //For now we go ahead and finish the parse, creating a block entry
-    //for this block.  This is somewhat wasteful, because without a
-    //track header there's nothing you can do with the block. What
-    //we really need here is a special return value that indicates to
-    //the caller that he should ignore this particular block, and
-    //continue parsing.
-
-    const Tracks* const pTracks = m_pSegment->GetTracks();
-    assert(pTracks);
-
-    const long tn = static_cast<long>(track);
-
-    const Track* const pTrack = pTracks->GetTrackByNumber(tn);
-
-    if (pTrack == NULL)
-        return E_FILE_FORMAT_INVALID;
-#endif
-
   pos += len;  // consume track number
 
   if ((pos + 2) > block_stop)
@@ -6679,12 +6170,7 @@
       return E_FILE_FORMAT_INVALID;
 
     if (id == 0x35A2) {  // DiscardPadding
-      result = GetUIntLength(pReader, pos, len);
-
-      if (result < 0)  // error
-        return static_cast<long>(result);
-
-      status = UnserializeInt(pReader, pos, len, discard_padding);
+      status = UnserializeInt(pReader, pos, size, discard_padding);
 
       if (status < 0)  // error
         return status;
@@ -6733,36 +6219,6 @@
     if (track == 0)
       return E_FILE_FORMAT_INVALID;
 
-#if 0
-        //TODO(matthewjheaney)
-        //This turned out to be too conservative.  The problem is that
-        //if we see a track header in the tracks element with an unsupported
-        //track type, we throw that track header away, so it is not present
-        //in the track map.  But even though we don't understand the track
-        //header, there are still blocks in the cluster with that track
-        //number.  It was our decision to ignore that track header, so it's
-        //up to us to deal with blocks associated with that track -- we
-        //cannot simply report an error since technically there's nothing
-        //wrong with the file.
-        //
-        //For now we go ahead and finish the parse, creating a block entry
-        //for this block.  This is somewhat wasteful, because without a
-        //track header there's nothing you can do with the block. What
-        //we really need here is a special return value that indicates to
-        //the caller that he should ignore this particular block, and
-        //continue parsing.
-
-        const Tracks* const pTracks = m_pSegment->GetTracks();
-        assert(pTracks);
-
-        const long tn = static_cast<long>(track);
-
-        const Track* const pTrack = pTracks->GetTrackByNumber(tn);
-
-        if (pTrack == NULL)
-            return E_FILE_FORMAT_INVALID;
-#endif
-
     pos += len;  // consume track number
 
     if ((pos + 2) > block_stop)
@@ -6924,68 +6380,6 @@
 
 long long Cluster::GetElementSize() const { return m_element_size; }
 
-#if 0
-bool Cluster::HasBlockEntries(
-    const Segment* pSegment,
-    long long off) {
-    assert(pSegment);
-    assert(off >= 0);  //relative to start of segment payload
-
-    IMkvReader* const pReader = pSegment->m_pReader;
-
-    long long pos = pSegment->m_start + off;  //absolute
-    long long size;
-
-    {
-        long len;
-
-        const long long id = ReadUInt(pReader, pos, len);
-        (void)id;
-        assert(id >= 0);
-        assert(id == 0x0F43B675);  //Cluster ID
-
-        pos += len;  //consume id
-
-        size = ReadUInt(pReader, pos, len);
-        assert(size > 0);
-
-        pos += len;  //consume size
-
-        //pos now points to start of payload
-    }
-
-    const long long stop = pos + size;
-
-    while (pos < stop)
-    {
-        long len;
-
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume id
-
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume size
-
-        if (id == 0x20)  //BlockGroup ID
-            return true;
-
-        if (id == 0x23)  //SimpleBlock ID
-            return true;
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    return false;
-}
-#endif
-
 long Cluster::HasBlockEntries(
     const Segment* pSegment,
     long long off,  // relative to start of segment payload
@@ -7269,7 +6663,7 @@
     assert(m_entries_size == 0);
 
     m_entries_size = 1024;
-    m_entries = new BlockEntry* [m_entries_size];
+    m_entries = new BlockEntry*[m_entries_size];
 
     m_entries_count = 0;
   } else {
@@ -7280,7 +6674,7 @@
     if (m_entries_count >= m_entries_size) {
       const long entries_size = 2 * m_entries_size;
 
-      BlockEntry** const entries = new BlockEntry* [entries_size];
+      BlockEntry** const entries = new BlockEntry*[entries_size];
       assert(entries);
 
       BlockEntry** src = m_entries;
@@ -7349,12 +6743,16 @@
         bsize = size;
       }
     } else if (id == 0x1B) {  // Duration ID
-      assert(size <= 8);
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
 
       duration = UnserializeUInt(pReader, pos, size);
-      assert(duration >= 0);  // TODO
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
     } else if (id == 0x7B) {  // ReferenceBlock
-      assert(size <= 8);
+      if (size > 8 || size <= 0)
+        return E_FILE_FORMAT_INVALID;
       const long size_ = static_cast<long>(size);
 
       long long time;
@@ -7373,9 +6771,10 @@
     pos += size;  // consume payload
     assert(pos <= stop);
   }
+  if (bpos < 0)
+    return E_FILE_FORMAT_INVALID;
 
   assert(pos == stop);
-  assert(bpos >= 0);
   assert(bsize >= 0);
 
   const long idx = m_entries_count;
@@ -7539,57 +6938,6 @@
   if (m_pSegment == NULL)  // this is the special EOS cluster
     return pTrack->GetEOS();
 
-#if 0
-
-    LoadBlockEntries();
-
-    if ((m_entries == NULL) || (m_entries_count <= 0))
-        return NULL;  //return EOS here?
-
-    const BlockEntry* pResult = pTrack->GetEOS();
-
-    BlockEntry** i = m_entries;
-    assert(i);
-
-    BlockEntry** const j = i + m_entries_count;
-
-    while (i != j)
-    {
-        const BlockEntry* const pEntry = *i++;
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if (pBlock->GetTrackNumber() != pTrack->GetNumber())
-            continue;
-
-        if (pTrack->VetEntry(pEntry))
-        {
-            if (time_ns < 0)  //just want first candidate block
-                return pEntry;
-
-            const long long ns = pBlock->GetTime(this);
-
-            if (ns > time_ns)
-                break;
-
-            pResult = pEntry;
-        }
-        else if (time_ns >= 0)
-        {
-            const long long ns = pBlock->GetTime(this);
-
-            if (ns > time_ns)
-                break;
-        }
-    }
-
-    return pResult;
-
-#else
-
   const BlockEntry* pResult = pTrack->GetEOS();
 
   long index = 0;
@@ -7643,103 +6991,11 @@
 
     ++index;
   }
-
-#endif
 }
 
 const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
                                     const CuePoint::TrackPosition& tp) const {
   assert(m_pSegment);
-
-#if 0
-
-    LoadBlockEntries();
-
-    if (m_entries == NULL)
-        return NULL;
-
-    const long long count = m_entries_count;
-
-    if (count <= 0)
-        return NULL;
-
-    const long long tc = cp.GetTimeCode();
-
-    if ((tp.m_block > 0) && (tp.m_block <= count))
-    {
-        const size_t block = static_cast<size_t>(tp.m_block);
-        const size_t index = block - 1;
-
-        const BlockEntry* const pEntry = m_entries[index];
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if ((pBlock->GetTrackNumber() == tp.m_track) &&
-            (pBlock->GetTimeCode(this) == tc))
-        {
-            return pEntry;
-        }
-    }
-
-    const BlockEntry* const* i = m_entries;
-    const BlockEntry* const* const j = i + count;
-
-    while (i != j)
-    {
-#ifdef _DEBUG
-        const ptrdiff_t idx = i - m_entries;
-        idx;
-#endif
-
-        const BlockEntry* const pEntry = *i++;
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if (pBlock->GetTrackNumber() != tp.m_track)
-            continue;
-
-        const long long tc_ = pBlock->GetTimeCode(this);
-        assert(tc_ >= 0);
-
-        if (tc_ < tc)
-            continue;
-
-        if (tc_ > tc)
-            return NULL;
-
-        const Tracks* const pTracks = m_pSegment->GetTracks();
-        assert(pTracks);
-
-        const long tn = static_cast<long>(tp.m_track);
-        const Track* const pTrack = pTracks->GetTrackByNumber(tn);
-
-        if (pTrack == NULL)
-            return NULL;
-
-        const long long type = pTrack->GetType();
-
-        if (type == 2)  //audio
-            return pEntry;
-
-        if (type != 1)  //not video
-            return NULL;
-
-        if (!pBlock->IsKey())
-            return NULL;
-
-        return pEntry;
-    }
-
-    return NULL;
-
-#else
-
   const long long tc = cp.GetTimeCode();
 
   if (tp.m_block > 0) {
@@ -7835,54 +7091,12 @@
 
     return pEntry;
   }
-
-#endif
 }
 
-#if 0
-const BlockEntry* Cluster::GetMaxKey(const VideoTrack* pTrack) const
-{
-    assert(pTrack);
-
-    if (m_pSegment == NULL)  //EOS
-        return pTrack->GetEOS();
-
-    LoadBlockEntries();
-
-    if ((m_entries == NULL) || (m_entries_count <= 0))
-        return pTrack->GetEOS();
-
-    BlockEntry** i = m_entries + m_entries_count;
-    BlockEntry** const j = m_entries;
-
-    while (i != j)
-    {
-        const BlockEntry* const pEntry = *--i;
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if (pBlock->GetTrackNumber() != pTrack->GetNumber())
-            continue;
-
-        if (pBlock->IsKey())
-            return pEntry;
-    }
-
-    return pTrack->GetEOS();  //no satisfactory block found
-}
-#endif
-
 BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {}
-
 BlockEntry::~BlockEntry() {}
-
 bool BlockEntry::EOS() const { return (GetKind() == kBlockEOS); }
-
 const Cluster* BlockEntry::GetCluster() const { return m_pCluster; }
-
 long BlockEntry::GetIndex() const { return m_index; }
 
 SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start,
@@ -7890,9 +7104,7 @@
     : BlockEntry(pCluster, idx), m_block(start, size, 0) {}
 
 long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); }
-
 BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; }
-
 const Block* SimpleBlock::GetBlock() const { return &m_block; }
 
 BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start,
@@ -7915,30 +7127,10 @@
   return 0;
 }
 
-#if 0
-void BlockGroup::ParseBlock(long long start, long long size)
-{
-    IMkvReader* const pReader = m_pCluster->m_pSegment->m_pReader;
-
-    Block* const pBlock = new Block(start, size, pReader);
-    assert(pBlock);  //TODO
-
-    //TODO: the Matroska spec says you have multiple blocks within the
-    //same block group, with blocks ranked by priority (the flag bits).
-
-    assert(m_pBlock == NULL);
-    m_pBlock = pBlock;
-}
-#endif
-
 BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; }
-
 const Block* BlockGroup::GetBlock() const { return &m_block; }
-
 long long BlockGroup::GetPrevTimeCode() const { return m_prev; }
-
 long long BlockGroup::GetNextTimeCode() const { return m_next; }
-
 long long BlockGroup::GetDurationTimeCode() const { return m_duration; }
 
 Block::Block(long long start, long long size_, long long discard_padding)
@@ -8028,7 +7220,7 @@
 
     const long long frame_size = stop - pos;
 
-    if (frame_size > LONG_MAX)
+    if (frame_size > LONG_MAX || frame_size <= 0)
       return E_FILE_FORMAT_INVALID;
 
     f.len = static_cast<long>(frame_size);
@@ -8088,6 +7280,9 @@
 
       f.pos = 0;  // patch later
 
+      if (frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
       f.len = frame_size;
       size += frame_size;  // contribution of this frame
 
@@ -8112,7 +7307,7 @@
 
       const long long frame_size = total_size - size;
 
-      if (frame_size > LONG_MAX)
+      if (frame_size > LONG_MAX || frame_size <= 0)
         return E_FILE_FORMAT_INVALID;
 
       f.len = static_cast<long>(frame_size);
@@ -8129,6 +7324,9 @@
 
     assert(pos == stop);
   } else if (lacing == 2) {  // fixed-size lacing
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
     const long long total_size = stop - pos;
 
     if ((total_size % m_frame_count) != 0)
@@ -8136,7 +7334,7 @@
 
     const long long frame_size = total_size / m_frame_count;
 
-    if (frame_size > LONG_MAX)
+    if (frame_size > LONG_MAX || frame_size <= 0)
       return E_FILE_FORMAT_INVALID;
 
     Frame* pf = m_frames;
@@ -8165,7 +7363,7 @@
 
     long long frame_size = ReadUInt(pReader, pos, len);
 
-    if (frame_size < 0)
+    if (frame_size <= 0)
       return E_FILE_FORMAT_INVALID;
 
     if (frame_size > LONG_MAX)
@@ -8227,7 +7425,7 @@
 
       frame_size += delta_size;
 
-      if (frame_size < 0)
+      if (frame_size <= 0)
         return E_FILE_FORMAT_INVALID;
 
       if (frame_size > LONG_MAX)
@@ -8239,7 +7437,8 @@
       --frame_count;
     }
 
-    {
+    // parse last frame
+    if (frame_count > 0) {
       assert(pos <= stop);
       assert(pf < pf_end);
 
@@ -8262,7 +7461,7 @@
 
       frame_size = total_size - size;
 
-      if (frame_size > LONG_MAX)
+      if (frame_size > LONG_MAX || frame_size <= 0)
         return E_FILE_FORMAT_INVALID;
 
       curr.len = static_cast<long>(frame_size);
@@ -8277,7 +7476,8 @@
       pos += f.len;
     }
 
-    assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
   }
 
   return 0;  // success
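
For illustration only (not part of the patch): a minimal sketch of how a caller could walk the Tags API added above, assuming `segment` points to an mkvparser::Segment that has already been opened and parsed successfully elsewhere.

    #include <cstdio>
    #include "mkvparser.hpp"

    // Print every SimpleTag name/value pair found in the segment's Tags element.
    static void DumpTags(const mkvparser::Segment* segment) {
      const mkvparser::Tags* const tags = segment->GetTags();
      if (tags == NULL)  // the file carries no Tags element
        return;
      for (int i = 0; i < tags->GetTagCount(); ++i) {
        const mkvparser::Tags::Tag* const tag = tags->GetTag(i);
        for (int j = 0; j < tag->GetSimpleTagCount(); ++j) {
          const mkvparser::Tags::SimpleTag* const st = tag->GetSimpleTag(j);
          // Either string may be NULL if the child element was absent.
          if (st->GetTagName() != NULL && st->GetTagString() != NULL)
            printf("%s=%s\n", st->GetTagName(), st->GetTagString());
        }
      }
    }
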
diff --git a/libvpx/third_party/libwebm/mkvparser.hpp b/libvpx/third_party/libwebm/mkvparser.hpp
index 3e17d07..aa0b432 100644
--- a/libvpx/third_party/libwebm/mkvparser.hpp
+++ b/libvpx/third_party/libwebm/mkvparser.hpp
@@ -32,7 +32,8 @@
 long long UnserializeUInt(IMkvReader*, long long pos, long long size);
 
 long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
-long UnserializeInt(IMkvReader*, long long pos, long len, long long& result);
+long UnserializeInt(IMkvReader*, long long pos, long long size,
+                    long long& result);
 
 long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
 
@@ -398,6 +399,10 @@
 
   long long GetWidth() const;
   long long GetHeight() const;
+  long long GetDisplayWidth() const;
+  long long GetDisplayHeight() const;
+  long long GetDisplayUnit() const;
+  long long GetStereoMode() const;
   double GetFrameRate() const;
 
   bool VetEntry(const BlockEntry*) const;
@@ -406,6 +411,11 @@
  private:
   long long m_width;
   long long m_height;
+  long long m_display_width;
+  long long m_display_height;
+  long long m_display_unit;
+  long long m_stereo_mode;
+
   double m_rate;
 };
 
@@ -582,6 +592,85 @@
   int m_editions_count;
 };
 
+class Tags {
+  Tags(const Tags&);
+  Tags& operator=(const Tags&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Tags(Segment*, long long payload_start, long long payload_size,
+       long long element_start, long long element_size);
+
+  ~Tags();
+
+  long Parse();
+
+  class Tag;
+  class SimpleTag;
+
+  class SimpleTag {
+    friend class Tag;
+    SimpleTag();
+    SimpleTag(const SimpleTag&);
+    ~SimpleTag();
+    SimpleTag& operator=(const SimpleTag&);
+
+   public:
+    const char* GetTagName() const;
+    const char* GetTagString() const;
+
+   private:
+    void Init();
+    void ShallowCopy(SimpleTag&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    char* m_tag_name;
+    char* m_tag_string;
+  };
+
+  class Tag {
+    friend class Tags;
+    Tag();
+    Tag(const Tag&);
+    ~Tag();
+    Tag& operator=(const Tag&);
+
+   public:
+    int GetSimpleTagCount() const;
+    const SimpleTag* GetSimpleTag(int index) const;
+
+   private:
+    void Init();
+    void ShallowCopy(Tag&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    long ParseSimpleTag(IMkvReader*, long long pos, long long size);
+    bool ExpandSimpleTagsArray();
+
+    SimpleTag* m_simple_tags;
+    int m_simple_tags_size;
+    int m_simple_tags_count;
+  };
+
+  int GetTagCount() const;
+  const Tag* GetTag(int index) const;
+
+ private:
+  long ParseTag(long long pos, long long size);
+  bool ExpandTagsArray();
+
+  Tag* m_tags;
+  int m_tags_size;
+  int m_tags_count;
+};
+
 class SegmentInfo {
   SegmentInfo(const SegmentInfo&);
   SegmentInfo& operator=(const SegmentInfo&);
@@ -684,7 +773,7 @@
   long long m_element_start;
   long long m_element_size;
 
-  void Load(IMkvReader*);
+  bool Load(IMkvReader*);
 
   long long GetTimeCode() const;  // absolute but unscaled
   long long GetTime(const Segment*) const;  // absolute and scaled (ns units)
@@ -697,7 +786,7 @@
     // reference = clusters containing req'd referenced blocks
     //  reftime = timecode of the referenced block
 
-    void Parse(IMkvReader*, long long, long long);
+    bool Parse(IMkvReader*, long long, long long);
   };
 
   const TrackPosition* Find(const Track*) const;
@@ -730,14 +819,6 @@
       long long time_ns, const Track*, const CuePoint*&,
       const CuePoint::TrackPosition*&) const;
 
-#if 0
-    bool FindNext(  //upper_bound of time_ns
-        long long time_ns,
-        const Track*,
-        const CuePoint*&,
-        const CuePoint::TrackPosition*&) const;
-#endif
-
   const CuePoint* GetFirst() const;
   const CuePoint* GetLast() const;
   const CuePoint* GetNext(const CuePoint*) const;
@@ -751,7 +832,7 @@
   bool DoneParsing() const;
 
  private:
-  void Init() const;
+  bool Init() const;
   void PreloadCuePoint(long&, long long) const;
 
   mutable CuePoint** m_cue_points;
@@ -877,18 +958,12 @@
   long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
                  long& size);
 
-#if 0
-    //This pair parses one cluster, but only changes the state of the
-    //segment object when the cluster is actually added to the index.
-    long ParseCluster(long long& cluster_pos, long long& new_pos) const;
-    bool AddCluster(long long cluster_pos, long long new_pos);
-#endif
-
   const SeekHead* GetSeekHead() const;
   const Tracks* GetTracks() const;
   const SegmentInfo* GetInfo() const;
   const Cues* GetCues() const;
   const Chapters* GetChapters() const;
+  const Tags* GetTags() const;
 
   long long GetDuration() const;
 
@@ -914,6 +989,7 @@
   Tracks* m_pTracks;
   Cues* m_pCues;
   Chapters* m_pChapters;
+  Tags* m_pTags;
   Cluster** m_clusters;
   long m_clusterCount;  // number of entries for which m_index >= 0
   long m_clusterPreloadCount;  // number of entries for which m_index < 0
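
For illustration only (not part of the patch): the new VideoTrack getters declared above fall back to the coded dimensions when the optional DisplayWidth/DisplayHeight elements are absent, so callers can use them unconditionally; `video` is assumed to be a VideoTrack obtained from a parsed Tracks list.

    // Choose render dimensions: display size when signalled, coded size otherwise.
    const long long render_width = video->GetDisplayWidth();    // equals GetWidth() when unset
    const long long render_height = video->GetDisplayHeight();  // equals GetHeight() when unset
    const long long stereo = video->GetStereoMode();            // 0 when no StereoMode element
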
diff --git a/libvpx/third_party/libwebm/webmids.hpp b/libvpx/third_party/libwebm/webmids.hpp
index eeb52d8..6874e44 100644
--- a/libvpx/third_party/libwebm/webmids.hpp
+++ b/libvpx/third_party/libwebm/webmids.hpp
@@ -133,7 +133,13 @@
   kMkvChapterDisplay = 0x80,
   kMkvChapString = 0x85,
   kMkvChapLanguage = 0x437C,
-  kMkvChapCountry = 0x437E
+  kMkvChapCountry = 0x437E,
+  // Tags
+  kMkvTags = 0x1254C367,
+  kMkvTag = 0x7373,
+  kMkvSimpleTag = 0x67C8,
+  kMkvTagName = 0x45A3,
+  kMkvTagString = 0x4487
 };
 
 }  // end namespace mkvmuxer
diff --git a/libvpx/third_party/libyuv/README.libvpx b/libvpx/third_party/libyuv/README.libvpx
index fa5b498..09693c1 100644
--- a/libvpx/third_party/libyuv/README.libvpx
+++ b/libvpx/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1041
+Version: 1456
 License: BSD
 License File: LICENSE
 
@@ -13,4 +13,3 @@
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
-None.
diff --git a/libvpx/third_party/libyuv/include/libyuv/compare.h b/libvpx/third_party/libyuv/include/libyuv/compare.h
index 5dfac7c..08b2bb2 100644
--- a/libvpx/third_party/libyuv/include/libyuv/compare.h
+++ b/libvpx/third_party/libyuv/include/libyuv/compare.h
@@ -22,6 +22,11 @@
 LIBYUV_API
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
 
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a,
diff --git a/libvpx/third_party/libyuv/include/libyuv/convert.h b/libvpx/third_party/libyuv/include/libyuv/convert.h
index 1bd45c8..a8d3fa0 100644
--- a/libvpx/third_party/libyuv/include/libyuv/convert.h
+++ b/libvpx/third_party/libyuv/include/libyuv/convert.h
@@ -71,6 +71,8 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+#define J400ToJ420 I400ToI420
+
 // Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,
@@ -113,15 +115,6 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert Q420 to I420.
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // ARGB little endian (bgra in memory) to I420.
 LIBYUV_API
 int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@@ -211,8 +204,6 @@
              int* width, int* height);
 #endif
 
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
-
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
 // "dst_stride_y" number of bytes in a row of the dst_y plane.
diff --git a/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
index a18014c..360c6d3 100644
--- a/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
@@ -18,7 +18,6 @@
 #include "libyuv/rotate.h"
 
 // TODO(fbarchard): This set of functions should exactly match convert.h
-// Add missing Q420.
 // TODO(fbarchard): Add tests. Create random content of right size and convert
 // with C vs Opt and or to I420 and compare.
 // TODO(fbarchard): Some of these functions lack parameter setting.
@@ -69,20 +68,20 @@
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// Convert I400 (grey) to ARGB.
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// Alias.
-#define YToARGB I400ToARGB_Reference
-
-// Convert I400 to ARGB. Reverse of ARGBToI400.
+// Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
-                         uint8* dst_argb, int dst_stride_argb,
-                         int width, int height);
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB
 
 // Convert NV12 to ARGB.
 LIBYUV_API
@@ -104,13 +103,6 @@
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
-// TODO(fbarchard): Convert Q420 to ARGB.
-// LIBYUV_API
-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
-//                const uint8* src_yuy2, int src_stride_yuy2,
-//                uint8* dst_argb, int dst_stride_argb,
-//                int width, int height);
-
 // Convert YUY2 to ARGB.
 LIBYUV_API
 int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -123,6 +115,22 @@
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
 int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@@ -184,8 +192,6 @@
                int dst_width, int dst_height);
 #endif
 
-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
-
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
 // "src_size" is needed to parse MJPG.
 // "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/libvpx/third_party/libyuv/include/libyuv/convert_from.h
index b1cf57f..9fd8d4d 100644
--- a/libvpx/third_party/libyuv/include/libyuv/convert_from.h
+++ b/libvpx/third_party/libyuv/include/libyuv/convert_from.h
@@ -57,7 +57,6 @@
              int width, int height);
 
 // TODO(fbarchard): I420ToM420
-// TODO(fbarchard): I420ToQ420
 
 LIBYUV_API
 int I420ToNV12(const uint8* src_y, int src_stride_y,
@@ -138,6 +137,17 @@
                  uint8* dst_frame, int dst_stride_frame,
                  int width, int height);
 
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
+
 LIBYUV_API
 int I420ToARGB1555(const uint8* src_y, int src_stride_y,
                    const uint8* src_u, int src_stride_u,
@@ -152,8 +162,6 @@
                    uint8* dst_frame, int dst_stride_frame,
                    int width, int height);
 
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
-
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
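
For illustration only (not part of the patch): a self-contained sketch exercising the new I420ToRGB565Dither() declaration above on a tiny 8x8 frame. The 4x4 matrix is 16 bytes, stays in the recommended 0-7 range, and its first byte maps to the upper-left pixel.

    #include "libyuv/convert_from.h"

    int main() {
      const int width = 8, height = 8;
      uint8 src_y[8 * 8] = {0};          // Y plane, stride = width
      uint8 src_u[4 * 4] = {0};          // U plane, stride = width / 2
      uint8 src_v[4 * 4] = {0};          // V plane, stride = width / 2
      uint8 dst_rgb565[8 * 8 * 2] = {0};  // 2 bytes per RGB565 pixel
      static const uint8 kDither4x4[16] = {
        0, 4, 1, 5,
        6, 2, 7, 3,
        1, 5, 0, 4,
        7, 3, 6, 2,
      };
      return libyuv::I420ToRGB565Dither(src_y, width,
                                        src_u, width / 2,
                                        src_v, width / 2,
                                        dst_rgb565, width * 2,
                                        kDither4x4, width, height);
    }
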
diff --git a/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
index 90f43af..1df5320 100644
--- a/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -61,6 +61,16 @@
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);
 
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height);
+
 // Convert ARGB To ARGB1555.
 LIBYUV_API
 int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@@ -105,6 +115,14 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert ARGB To I411.
 LIBYUV_API
 int ARGBToI411(const uint8* src_argb, int src_stride_argb,
@@ -125,6 +143,12 @@
                uint8* dst_y, int dst_stride_y,
                int width, int height);
 
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
+
 // Convert ARGB To NV12.
 LIBYUV_API
 int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
diff --git a/libvpx/third_party/libyuv/include/libyuv/format_conversion.h b/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
deleted file mode 100644
index b18bf05..0000000
--- a/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_  // NOLINT
-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert Bayer RGB formats to I420.
-LIBYUV_API
-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
-    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
-
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Convert I420 to Bayer RGB formats.
-LIBYUV_API
-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-LIBYUV_API
-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    uint8* dst_frame, int dst_stride_frame,
-                    int width, int height);
-
-// Temporary API mapper.
-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
-    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
-
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-// Convert Bayer RGB formats to ARGB.
-LIBYUV_API
-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-LIBYUV_API
-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer);
-
-// Converts ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-LIBYUV_API
-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_bayer, int dst_stride_bayer,
-                    int width, int height);
-
-// Temporary API mapper.
-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer);
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_  NOLINT
diff --git a/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -153,7 +153,6 @@
      int* subsample_x, int* subsample_y, int number_of_components);
 
  private:
-
   void AllocOutputBuffers(int num_outbufs);
   void DestroyOutputBuffers();
 
diff --git a/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
index d10a169..ae994db 100644
--- a/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
@@ -45,6 +45,7 @@
                uint8* dst_y, int dst_stride_y,
                int width, int height);
 
+#define J400ToJ400 I400ToI400
 
 // Copy I422 to I422.
 #define I422ToI422 I422Copy
@@ -84,6 +85,18 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -93,6 +106,7 @@
                int width, int height);
 
 // Alias
+#define J420ToJ400 I420ToI400
 #define I420ToI420Mirror I420Mirror
 
 // I420 mirror.
@@ -387,24 +401,24 @@
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation);
 
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
 
-// Row functions for copying a pixels from a source with a slope to a row
+// Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
-#define HAS_ARGBAFFINEROW_SSE2
-#endif  // LIBYUV_DISABLE_X86
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
diff --git a/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 0000000..c41cf32
--- /dev/null
+++ b/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".private_extern _" #name "                \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#else
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+#name ":                                       \n"
+#endif
+#endif
+
+// The following are available for Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 32 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_MIPS_DSPR2
+#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
+#endif  // defined(__mips__)
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
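
The transpose entry points declared above all share one contract: a width x height tile read with src_stride is written as a height x width tile with dst_stride, which is how the library builds its 90/270-degree rotations. A stand-alone sketch of that contract (an illustration of the semantics, not the upstream TransposeWxH_C body):

  #include <stdint.h>

  /* Illustrative only: dst[x][y] = src[y][x], using uint8_t in place of the
   * library's uint8 typedef. */
  static void TransposeWxH_sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride,
                                  int width, int height) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        dst[x * dst_stride + y] = src[y * src_stride + x];
      }
    }
  }
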
diff --git a/libvpx/third_party/libyuv/include/libyuv/row.h b/libvpx/third_party/libyuv/include/libyuv/row.h
index fdfe1ae..ebae3e7 100644
--- a/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -15,10 +15,6 @@
 
 #include "libyuv/basic_types.h"
 
-#if defined(__native_client__)
-#include "ppapi/c/pp_macros.h"  // For PPAPI_RELEASE
-#endif
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -41,9 +37,8 @@
   free(var##_mem);  \
   var = 0
 
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR) || \
-    (defined(_MSC_VER) && defined(__clang__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -51,76 +46,44 @@
 #define LIBYUV_SSSE3_ONLY
 #endif
 
-// Enable for NaCL pepper 33 for bundle and AVX2 support.
-#if defined(__native_client__) && PPAPI_RELEASE >= 33
-#define NEW_BINUTILS
-#endif
-#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+#if defined(__native_client__)
 #define LIBYUV_DISABLE_NEON
 #endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
 
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Effects:
-#define HAS_ARGBADDROW_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBCOLORMATRIXROW_SSSE3
-#define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYYTOALPHAROW_SSE2
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
-#define HAS_ARGBPOLYNOMIALROW_SSE2
-#define HAS_ARGBQUANTIZEROW_SSE2
-#define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
-#define HAS_RGBCOLORTABLEROW_X86
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELTOPLANEROW_SSE2
-#define HAS_SOBELXROW_SSE2
-#define HAS_SOBELXYROW_SSE2
-#define HAS_SOBELYROW_SSE2
-
 // Conversions:
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
 #define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
-#define HAS_ARGBTOBAYERGGROW_SSE2
-#define HAS_ARGBTOBAYERROW_SSSE3
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
 #define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
-#define HAS_HALFROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
@@ -133,6 +96,8 @@
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
@@ -149,21 +114,52 @@
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
 #define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
 #define HAS_UYVYTOARGBROW_SSSE3
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
-#define HAS_YTOARGBROW_SSE2
 #define HAS_YUY2TOARGBROW_SSSE3
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
 #endif
 
-// The following are available on x64 Visual C:
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
 #define HAS_I422TOARGBROW_SSSE3
 #endif
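
The HAS_* defines in these blocks only record that an optimized row function was compiled in; the conversion sources still check the CPU before using it. The usual dispatch idiom looks roughly like the sketch below (TestCpuFlag/kCpuHasSSSE3 come from libyuv's cpu_id.h and IS_ALIGNED from basic_types.h; the helper name here is made up):

  #include "libyuv/basic_types.h"  /* IS_ALIGNED */
  #include "libyuv/cpu_id.h"       /* TestCpuFlag, kCpuHasSSSE3 */
  #include "libyuv/row.h"          /* HAS_* defines and row prototypes */

  typedef void (*ARGBToYRowFn)(const uint8* src_argb, uint8* dst_y, int pix);

  /* Sketch: start from the portable C row, upgrade to the Any_ wrapper when
   * the CPU has SSSE3, and to the full-speed row when the width is a whole
   * number of 16-pixel steps. */
  static ARGBToYRowFn PickARGBToYRow(int width) {
    ARGBToYRowFn row = ARGBToYRow_C;
  #if defined(HAS_ARGBTOYROW_SSSE3)
    if (TestCpuFlag(kCpuHasSSSE3)) {
      row = ARGBToYRow_Any_SSSE3;
      if (IS_ALIGNED(width, 16)) {
        row = ARGBToYRow_SSSE3;
      }
    }
  #endif
    return row;
  }
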
 
@@ -182,37 +178,64 @@
 #endif  // __clang__
 
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
+// The following are available but require VS2012.  Port to GCC.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
 // The code supports NaCL but requires a new compiler and validator.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
     defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-// Effects:
-#define HAS_ARGBPOLYNOMIALROW_AVX2
-#define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBCOPYALPHAROW_AVX2
 #define HAS_ARGBCOPYYTOALPHAROW_AVX2
-#endif
-
-// The following are require VS2012.
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
-#define HAS_HALFROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
 #define HAS_UYVYTOUVROW_AVX2
 #define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
@@ -220,43 +243,23 @@
 // Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
-#define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
-#endif  // defined(VISUALC_HAS_AVX2)
-
-// The following are Yasm x86 only:
-// TODO(fbarchard): Port AVX2 to inline.
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
-    (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__x86_64__) || defined(__i386__))
-#define HAS_MERGEUVROW_AVX2
-#define HAS_MERGEUVROW_MMX
-#define HAS_SPLITUVROW_AVX2
-#define HAS_SPLITUVROW_MMX
-#define HAS_UYVYTOYROW_AVX2
-#define HAS_UYVYTOYROW_MMX
-#define HAS_YUY2TOYROW_AVX2
-#define HAS_YUY2TOYROW_MMX
 #endif
 
 // The following are disabled when SSSE3 is available:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBBLENDROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #endif
 
-// The following are available on arm64 platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#endif
-
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON
 #define HAS_ARGB1555TOARGBROW_NEON
@@ -267,23 +270,20 @@
 #define HAS_ARGB4444TOYROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOBAYERROW_NEON
-#define HAS_ARGBTOBAYERGGROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
 #define HAS_ARGBTOUV411ROW_NEON
 #define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
-#define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
-#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_HALFROW_NEON
-#define HAS_I400TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
 #define HAS_I411TOARGBROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -316,16 +316,18 @@
 #define HAS_RGBATOUVROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
 #define HAS_SPLITUVROW_NEON
 #define HAS_UYVYTOARGBROW_NEON
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
-#define HAS_YTOARGBROW_NEON
+#define HAS_I400TOARGBROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
 #define HAS_YUY2TOUV422ROW_NEON
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
 
 // Effects:
 #define HAS_ARGBADDROW_NEON
@@ -338,25 +340,25 @@
 #define HAS_ARGBSEPIAROW_NEON
 #define HAS_ARGBSHADEROW_NEON
 #define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
 #define HAS_SOBELROW_NEON
 #define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXYROW_NEON
 #define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
 #define HAS_SOBELYROW_NEON
-#define HAS_INTERPOLATEROW_NEON
-// TODO(fbarchard): Investigate neon unittest failure.
-// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
 #endif
 
 // The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_I422TOABGRROW_MIPS_DSPR2
 #define HAS_I422TOARGBROW_MIPS_DSPR2
 #define HAS_I422TOBGRAROW_MIPS_DSPR2
-#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
 #define HAS_MIRRORROW_MIPS_DSPR2
 #define HAS_MIRRORUVROW_MIPS_DSPR2
 #define HAS_SPLITUVROW_MIPS_DSPR2
@@ -365,6 +367,7 @@
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var
 typedef __declspec(align(16)) int16 vec16[8];
 typedef __declspec(align(16)) int32 vec32[4];
 typedef __declspec(align(16)) int8 vec8[16];
@@ -377,24 +380,37 @@
 typedef __declspec(align(32)) uint16 ulvec16[16];
 typedef __declspec(align(32)) uint32 ulvec32[8];
 typedef __declspec(align(32)) uint8 ulvec8[32];
-
 #elif defined(__GNUC__)
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
 typedef int16 __attribute__((vector_size(16))) vec16;
 typedef int32 __attribute__((vector_size(16))) vec32;
 typedef int8 __attribute__((vector_size(16))) vec8;
 typedef uint16 __attribute__((vector_size(16))) uvec16;
 typedef uint32 __attribute__((vector_size(16))) uvec32;
 typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
 #else
 #define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
 typedef int16 vec16[8];
 typedef int32 vec32[4];
 typedef int8 vec8[16];
 typedef uint16 uvec16[8];
 typedef uint32 uvec32[4];
 typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
 #endif
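
SIMD_ALIGNED and the new SIMD_ALIGNED32 wrap a declaration so the same source gets 16-byte (SSE) or wider (AVX2) alignment under either the __declspec or the __attribute__ syntax selected above. A small usage sketch (the buffer names are made up for illustration):

  /* Expands to __declspec(align(16)) uint8 row_sse[2048]; on MSVC and to
   * uint8 row_sse[2048] __attribute__((aligned(16))); on GCC/clang. */
  SIMD_ALIGNED(uint8 row_sse[2048]);
  SIMD_ALIGNED32(uint8 row_avx[2048]);
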
 
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
@@ -404,24 +420,16 @@
 #endif
 
 // NaCL macros for GCC x86 and x64.
-
-// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
-// NEW_BINUTILS and remove all BUNDLEALIGN occurances.
 #if defined(__native_client__)
 #define LABELALIGN ".p2align 5\n"
 #else
-#define LABELALIGN ".p2align 2\n"
+#define LABELALIGN
 #endif
 #if defined(__native_client__) && defined(__x86_64__)
-#if defined(NEW_BINUTILS)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
 #define BUNDLELOCK ".bundle_lock\n"
 #define BUNDLEUNLOCK ".bundle_unlock\n"
-#define BUNDLEALIGN "\n"
-#else
-#define BUNDLELOCK "\n"
-#define BUNDLEUNLOCK "\n"
-#define BUNDLEALIGN ".p2align 5\n"
-#endif
 #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
 #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
 #define MEMLEA(offset, base) #offset "(%q" #base ")"
@@ -446,8 +454,19 @@
     "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
     #opcode " (%%r15,%%r14),%" #arg "\n" \
     BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+    BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
 #else  // defined(__native_client__) && defined(__x86_64__)
-#define BUNDLEALIGN "\n"
+#define NACL_R14
+#define BUNDLEALIGN
 #define MEMACCESS(base) "(%" #base ")"
 #define MEMACCESS2(offset, base) #offset "(%" #base ")"
 #define MEMLEA(offset, base) #offset "(%" #base ")"
@@ -463,14 +482,19 @@
     #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
 #define MEMOPARG(opcode, offset, base, index, scale, arg) \
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+    #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
 #endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 #undef MEMACCESS
 #if defined(__native_client__)
-#define MEMACCESS(base) ".p2align   3\nbic %" #base ", #0xc0000000\n"
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
 #else
-#define MEMACCESS(base) "\n"
+#define MEMACCESS(base)
 #endif
 #endif
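
The MEMACCESS/MEMLEA family exists so the same GCC inline-assembly templates address memory both natively and inside the Native Client sandbox, where every access must go through %r15. A stand-alone sketch of how the non-NaCl expansion is pasted together (illustrative; not part of the header):

  #include <stdio.h>

  /* Same stringizing trick as the header, assuming a non-NaCl x86 build. */
  #define MEMACCESS(base) "(%" #base ")"
  #define MEMLEA(offset, base) #offset "(%" #base ")"

  int main(void) {
    /* Adjacent string literals are pasted by the compiler, so an asm
     * template written as "movdqu " MEMACCESS(0) ",%%xmm0" is the single
     * string "movdqu (%0),%%xmm0".  Under __native_client__ the same macro
     * instead yields the r15-sandboxed form "%%nacl:(%%r15,%q0)". */
    puts("movdqu " MEMACCESS(0) ",%%xmm0");
    puts("lea " MEMLEA(0x10,1) ",%%eax");
    return 0;
  }
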
 
@@ -563,13 +587,6 @@
 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
 void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
 void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -648,16 +665,6 @@
                        uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                        uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
-                                  uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                                 uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                                 uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -719,15 +726,11 @@
 
 void ARGBToUV444Row_SSSE3(const uint8* src_argb,
                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
                               uint8* dst_u, uint8* dst_v, int width);
 
 void ARGBToUV422Row_SSSE3(const uint8* src_argb,
                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
-                                    uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
                               uint8* dst_u, uint8* dst_v, int width);
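
This pull removes the _Unaligned_ row variants throughout; the remaining entry points handle alignment themselves, and the _Any_ wrappers cover widths that are not a multiple of the SIMD step. A simplified sketch of what such a wrapper does (the real ones in row_any.cc are macro-generated and push the tail through a scratch buffer rather than the C row):

  #include "libyuv/row.h"

  /* Illustration only: run the vector kernel on whole 16-pixel steps, then
   * let the C kernel finish the 0..15 leftover pixels. */
  static void ARGBToYRow_Any_sketch(const uint8* src_argb, uint8* dst_y,
                                    int pix) {
    int n = pix & ~15;
    if (n > 0) {
      ARGBToYRow_SSSE3(src_argb, dst_y, n);
    }
    ARGBToYRow_C(src_argb + n * 4, dst_y + n, pix - n);
  }
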
 
@@ -737,6 +740,8 @@
                       uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV411Row_C(const uint8* src_argb,
                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
 
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
@@ -744,6 +749,10 @@
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
 
 void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                        int width);
@@ -755,9 +764,12 @@
                    int width);
 
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
 
 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
@@ -765,10 +777,6 @@
 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
-                                     uint8* dst_v, int pix);
 void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                          int pix);
 void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -786,8 +794,6 @@
                      int width);
 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width);
 void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                          int width);
 void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -796,11 +802,14 @@
                          int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
 
 void CopyRow_16_C(const uint16* src, uint16* dst, int count);
 
@@ -812,15 +821,17 @@
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
 void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
 
-void SetRow_X86(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
-                     int dst_stride, int height);
-void SetRow_NEON(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
-                      int dst_stride, int height);
-void SetRow_C(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
-                   int height);
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
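
Note the API change here: the old SetRow/ARGBSetRows functions took a 32-bit value plus width/stride/height, while the new SetRow_* fill bytes and the new ARGBSetRow_* fill whole pixels, one row at a time. A sketch of the new contracts (illustrative bodies, not the upstream ones):

  #include <string.h>
  #include "libyuv/basic_types.h"

  /* SetRow_*: write 'count' copies of the byte v8. */
  static void SetRow_sketch(uint8* dst, uint8 v8, int count) {
    memset(dst, v8, count);
  }

  /* ARGBSetRow_*: write 'count' copies of the 32-bit ARGB value v32. */
  static void ARGBSetRow_sketch(uint8* dst_argb, uint32 v32, int count) {
    for (int x = 0; x < count; ++x) {
      memcpy(dst_argb + x * 4, &v32, 4);  /* no alignment assumption */
    }
  }
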
 
 // ARGBShufflers for BGRAToARGB etc.
 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
@@ -833,8 +844,6 @@
                          const uint8* shuffler, int pix);
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix);
 void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
                              const uint8* shuffler, int pix);
 void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -851,6 +860,11 @@
                             int pix);
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
 
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
@@ -866,12 +880,20 @@
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+
 void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                               int pix);
 void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                                 int pix);
 void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                                 int pix);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+
 void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
@@ -887,11 +909,24 @@
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
 
 void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
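
The new ARGBToRGB565DitherRow_* entry points a few lines above add an ordered-dither value before truncating each channel to 5/6/5 bits; dither4 packs one dither byte per output column (x & 3). A sketch of the per-pixel math, assuming that packing (an approximation of the C row, not a verbatim copy of row_common.cc):

  #include <stdint.h>

  static void ARGBToRGB565DitherRow_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_rgb,
                                           uint32_t dither4, int pix) {
    for (int x = 0; x < pix; ++x) {
      int d = (dither4 >> ((x & 3) * 8)) & 0xff;   /* dither byte for column */
      int b = src_argb[0] + d; if (b > 255) b = 255;
      int g = src_argb[1] + d; if (g > 255) g = 255;
      int r = src_argb[2] + d; if (r > 255) r = 255;
      uint16_t rgb = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
      dst_rgb[0] = (uint8_t)(rgb & 0xff);
      dst_rgb[1] = (uint8_t)(rgb >> 8);
      src_argb += 4;
      dst_rgb += 2;
    }
  }
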
@@ -900,12 +935,13 @@
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 
 void I444ToARGBRow_C(const uint8* src_y,
                      const uint8* src_u,
@@ -944,6 +980,11 @@
 void UYVYToARGBRow_C(const uint8* src_uyvy,
                      uint8* dst_argb,
                      int width);
+void J422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
 void I422ToBGRARow_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
@@ -984,19 +1025,36 @@
                        const uint8* src_v,
                        uint8* dst_rgb565,
                        int width);
-void YToARGBRow_C(const uint8* src_y,
-                  uint8* dst_argb,
-                  int width);
 void I422ToARGBRow_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_argb,
                         int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToABGRRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I444ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1007,6 +1065,11 @@
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void NV12ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_argb,
@@ -1015,6 +1078,14 @@
                          const uint8* src_vu,
                          uint8* dst_argb,
                          int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        int width);
 void NV12ToRGB565Row_SSSE3(const uint8* src_y,
                            const uint8* src_uv,
                            uint8* dst_argb,
@@ -1023,12 +1094,36 @@
                            const uint8* src_vu,
                            uint8* dst_argb,
                            int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_argb,
+                          int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_vu,
+                          uint8* dst_argb,
+                          int width);
 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                          uint8* dst_argb,
                          int width);
 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                          uint8* dst_argb,
                          int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToBGRARow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1049,82 +1144,81 @@
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToRGB565Row_SSSE3(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb,
                            int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_argb,
+                          int width);
 void I422ToRGB24Row_SSSE3(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_rgb24,
                           int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
 void I422ToRAWRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_raw,
                         int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_argb,
-                                   int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_uv,
-                                   uint8* dst_argb,
-                                   int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_vu,
-                                   uint8* dst_argb,
-                                   int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
-                                   uint8* dst_argb,
-                                   int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
-                                   uint8* dst_argb,
-                                   int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_bgra,
-                                   int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_abgr,
-                                   int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
-                                   const uint8* src_u,
-                                   const uint8* src_v,
-                                   uint8* dst_rgba,
-                                   int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
 void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1135,6 +1229,11 @@
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_uv,
                              uint8* dst_argb,
@@ -1143,6 +1242,14 @@
                              const uint8* src_vu,
                              uint8* dst_argb,
                              int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            int width);
 void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
                                const uint8* src_uv,
                                uint8* dst_argb,
@@ -1151,12 +1258,36 @@
                                const uint8* src_vu,
                                uint8* dst_argb,
                                int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_vu,
+                              uint8* dst_argb,
+                              int width);
 void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
                              uint8* dst_argb,
                              int width);
 void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
                              uint8* dst_argb,
                              int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1177,39 +1308,59 @@
                                  const uint8* src_v,
                                  uint8* dst_rgba,
                                  int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_rgba,
+                                int width);
 void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
                                  const uint8* src_u,
                                  const uint8* src_v,
                                  uint8* dst_rgba,
                                  int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_rgba,
+                                int width);
 void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
                                uint8* dst_rgba,
                                int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgba,
+                              int width);
 void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_u,
                               const uint8* src_v,
                               uint8* dst_argb,
                               int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
 void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
-void YToARGBRow_SSE2(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_NEON(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_Any_SSE2(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
-void YToARGBRow_Any_NEON(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
 
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
@@ -1277,11 +1428,22 @@
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
 void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
 
 void I444ToARGBRow_Any_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -1401,12 +1563,6 @@
                       uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix);
@@ -1442,12 +1598,6 @@
                       uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix);
@@ -1480,39 +1630,6 @@
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int pix);
 
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-               uint8* dst_uv, int pix);
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix);
-
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
-                  uint16* dst_uv, int pix);
-
-void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
-                      uint32 selector, int pix);
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                          uint32 selector, int pix);
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix);
-void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                              uint32 selector, int pix);
-void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
-                             uint32 selector, int pix);
-void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
-                        uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
-
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
@@ -1648,15 +1765,9 @@
 void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride_ptr, int width,
                          int source_y_fraction);
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                                ptrdiff_t src_stride_ptr, int width,
-                                int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride_ptr, int width,
-                                   int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride_ptr, int width,
-                                    int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride_ptr, int width,
+                               int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride_ptr, int width,
                              int source_y_fraction);
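
InterpolateRow_* (note the rename from InterpolateRows_MIPS_DSPR2 above) blends each pixel of a row with the pixel src_stride_ptr bytes below it, weighted by source_y_fraction; it is used heavily by the bilinear scalers. A sketch of the arithmetic, assuming the 0..256 fraction convention (check row_common.cc for the exact rounding):

  #include <stddef.h>
  #include <stdint.h>

  static void InterpolateRow_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride_ptr, int width,
                                    int source_y_fraction) {
    const uint8_t* src1 = src_ptr + src_stride_ptr;
    int y1 = source_y_fraction;        /* weight of the lower row */
    int y0 = 256 - y1;                 /* weight of the upper row */
    for (int x = 0; x < width; ++x) {
      dst_ptr[x] = (uint8_t)((src_ptr[x] * y0 + src1[x] * y1) >> 8);
    }
  }
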
@@ -1669,9 +1780,9 @@
 void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride_ptr, int width,
                              int source_y_fraction);
-void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride_ptr, int width,
-                                    int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride_ptr, int width,
+                                   int source_y_fraction);
 
 void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
                          ptrdiff_t src_stride_ptr,
@@ -1708,6 +1819,18 @@
                      uint8* dst_argb, int width);
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
 
 void ARGBPolynomialRow_C(const uint8* src_argb,
                          uint8* dst_argb, const float* poly,
diff --git a/libvpx/third_party/libyuv/include/libyuv/scale.h b/libvpx/third_party/libyuv/include/libyuv/scale.h
index a3bc07e..102158d 100644
--- a/libvpx/third_party/libyuv/include/libyuv/scale.h
+++ b/libvpx/third_party/libyuv/include/libyuv/scale.h
@@ -34,6 +34,7 @@
                 int dst_width, int dst_height,
                 enum FilterMode filtering);
 
+LIBYUV_API
 void ScalePlane_16(const uint16* src, int src_stride,
                    int src_width, int src_height,
                    uint16* dst, int dst_stride,
diff --git a/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/libvpx/third_party/libyuv/include/libyuv/scale_row.h
index 8dc0762..94ad9cf 100644
--- a/libvpx/third_party/libyuv/include/libyuv/scale_row.h
+++ b/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -12,45 +12,66 @@
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
 
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SCALEROWDOWN2_SSE2
-#define HAS_SCALEROWDOWN4_SSE2
-#define HAS_SCALEROWDOWN34_SSSE3
-#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEADDROWS_SSE2
-#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALECOLSUP2_SSE2
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
 #define HAS_SCALEARGBROWDOWN2_SSE2
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
-#define HAS_SCALEARGBCOLS_SSE2
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-#define HAS_SCALEARGBCOLSUP2_SSE2
-#define HAS_FIXEDDIV_X86
-#define HAS_FIXEDDIV1_X86
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSE2
+#endif
+
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
 #endif
 
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
 #define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 
 // The following are available on Mips platforms:
@@ -164,10 +185,8 @@
                             uint8* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width);
@@ -194,25 +213,28 @@
 void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);
 
+// Specialized scalers for x86.
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -229,46 +251,124 @@
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx);
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx);
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
+
+
+// ARGB Column functions
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx);
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx);
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx);
-// Row functions.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
-                                  int src_stepx,
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
 
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
@@ -276,7 +376,8 @@
 // Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width);
-
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);
 
@@ -311,6 +412,42 @@
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
 
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
+
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
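
The scale_row.h changes above replace the _Unaligned_ variants and the multi-row ScaleAddRows helpers with single-row kernels plus _Any_ wrappers that absorb leftover pixels, so callers pick one function per plane instead of testing pointer alignment. A minimal dispatch sketch in the idiom this pull uses elsewhere; the ScaleAddRow names and the HAS_SCALEADDROW_SSE2 guard come from this header, while the surrounding scaffolding and the multiple-of-16 assumption (mirroring the IS_ALIGNED checks used for other SSE2 row functions) are illustrative only:

  // Caller-side selection for one planar scale step (sketch).
  void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
      ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    // _Any_ wrapper handles ragged widths; exact kernel when the width is a
    // multiple of 16 (assumed vector width here).
    ScaleAddRow = IS_ALIGNED(src_width, 16) ? ScaleAddRow_SSE2
                                            : ScaleAddRow_Any_SSE2;
  }
#endif
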
diff --git a/libvpx/third_party/libyuv/include/libyuv/version.h b/libvpx/third_party/libyuv/include/libyuv/version.h
index 912c4c9..9d1d746 100644
--- a/libvpx/third_party/libyuv/include/libyuv/version.h
+++ b/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1041
+#define LIBYUV_VERSION 1456
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
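
The snapshot's LIBYUV_VERSION moves from 1041 to 1456, which is the simplest way for code that includes these headers to tell whether it is looking at the new API surface, for example after the removal of Q420ToI420 and the _Unaligned_ row functions later in this pull. A hedged compile-time guard; 1456 is simply this snapshot's value, not an official cut-over point:

  #include "libyuv/version.h"

  #if LIBYUV_VERSION >= 1456
  // This snapshot: Q420ToI420 and the _Unaligned_ row variants are gone.
  #else
  // Pre-update snapshot (1041): the legacy entry points are still declared.
  #endif
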
diff --git a/libvpx/third_party/libyuv/include/libyuv/video_common.h b/libvpx/third_party/libyuv/include/libyuv/video_common.h
index 91acc2f..cb6582f 100644
--- a/libvpx/third_party/libyuv/include/libyuv/video_common.h
+++ b/libvpx/third_party/libyuv/include/libyuv/video_common.h
@@ -62,7 +62,7 @@
 
   // 2 Secondary YUV formats: row biplanar.
   FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
 
   // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -75,7 +75,7 @@
   FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
   FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
 
-  // 4 Secondary RGB formats: 4 Bayer Patterns.
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
   FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
   FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
   FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
diff --git a/libvpx/third_party/libyuv/source/compare.cc b/libvpx/third_party/libyuv/source/compare.cc
index 9ea81b4..46aa847 100644
--- a/libvpx/third_party/libyuv/source/compare.cc
+++ b/libvpx/third_party/libyuv/source/compare.cc
@@ -19,6 +19,7 @@
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
+#include "libyuv/video_common.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -36,7 +37,7 @@
 #define HAS_HASHDJB2_SSE41
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
 
-#if _MSC_VER >= 1700
+#ifdef VISUALC_HAS_AVX2
 #define HAS_HASHDJB2_AVX2
 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
 #endif
@@ -78,9 +79,57 @@
   return seed;
 }
 
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
+      return FOURCC_BGRA;
+    }
+    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
+      return FOURCC_ARGB;
+    }
+    argb += 8;
+  }
+  if (width & 1) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+  uint32 fourcc = 0;
+  int h;
+
+  // Coalesce rows.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
+  }
+  for (h = 0; h < height && fourcc == 0; ++h) {
+    fourcc = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+  }
+  return fourcc;
+}
+
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
@@ -89,8 +138,8 @@
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
-// Visual C 2012 required for AVX2.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
+
+#ifdef VISUALC_HAS_AVX2
 #define HAS_SUMSQUAREERROR_AVX2
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
 #endif
@@ -114,8 +163,7 @@
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
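
compare.cc gains ARGBDetect, which walks an opaque image and reports a FOURCC purely from where the constant 0xFF bytes sit: per ARGBDetectRow_C above, a non-255 byte at offset 0 yields FOURCC_BGRA, a non-255 byte at offset 3 yields FOURCC_ARGB, and 0 means the scan was inconclusive. A small usage sketch; the 4x1 buffer and its values are illustrative, only the ARGBDetect call itself comes from this patch:

  // Four opaque pixels whose first byte is 255 and whose fourth byte is not.
  const uint8 pixels[4 * 4] = {
    255, 10, 20, 30,   255, 11, 21, 31,
    255, 12, 22, 32,   255, 13, 23, 33,
  };
  // width = 4, height = 1, stride = width * 4.
  uint32 fourcc = ARGBDetect(pixels, 4 * 4, 4, 1);
  // Per ARGBDetectRow_C: byte 0 is 255, byte 3 is not, so fourcc == FOURCC_ARGB.
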
diff --git a/libvpx/third_party/libyuv/source/compare_posix.cc b/libvpx/third_party/libyuv/source/compare_gcc.cc
similarity index 95%
rename from libvpx/third_party/libyuv/source/compare_posix.cc
rename to libvpx/third_party/libyuv/source/compare_gcc.cc
index ac36119..247cb33 100644
--- a/libvpx/third_party/libyuv/source/compare_posix.cc
+++ b/libvpx/third_party/libyuv/source/compare_gcc.cc
@@ -25,11 +25,10 @@
     "pxor      %%xmm5,%%xmm5                   \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
     "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
     "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm1,%%xmm3                   \n"
     "psubusb   %%xmm2,%%xmm1                   \n"
     "psubusb   %%xmm3,%%xmm2                   \n"
@@ -41,6 +40,7 @@
     "pmaddwd   %%xmm2,%%xmm2                   \n"
     "paddd     %%xmm1,%%xmm0                   \n"
     "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
 
     "pshufd    $0xee,%%xmm0,%%xmm1             \n"
@@ -53,11 +53,7 @@
     "+r"(src_b),      // %1
     "+r"(count),      // %2
     "=g"(sse)         // %3
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );  // NOLINT
   return sse;
 }
@@ -124,13 +120,13 @@
     "pmulld    %%xmm5,%%xmm1                   \n"
     "paddd     %%xmm4,%%xmm3                   \n"
     "paddd     %%xmm2,%%xmm1                   \n"
-    "sub       $0x10,%1                        \n"
     "paddd     %%xmm3,%%xmm1                   \n"
     "pshufd    $0xe,%%xmm1,%%xmm2              \n"
     "paddd     %%xmm2,%%xmm1                   \n"
     "pshufd    $0x1,%%xmm1,%%xmm2              \n"
     "paddd     %%xmm2,%%xmm1                   \n"
     "paddd     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%1                        \n"
     "jg        1b                              \n"
     "movd      %%xmm0,%3                       \n"
   : "+r"(src),        // %0
@@ -143,9 +139,7 @@
     "m"(kHashMul2),   // %7
     "m"(kHashMul3)    // %8
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
   );  // NOLINT
   return hash;
 }
diff --git a/libvpx/third_party/libyuv/source/compare_neon.cc b/libvpx/third_party/libyuv/source/compare_neon.cc
index 5e7b8e4..ef006ec 100644
--- a/libvpx/third_party/libyuv/source/compare_neon.cc
+++ b/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -16,7 +16,8 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   volatile uint32 sse;
@@ -56,7 +57,7 @@
   return sse;
 }
 
-#endif  // __ARM_NEON__
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/compare_neon64.cc b/libvpx/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 0000000..6d1e5e1
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
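
compare_neon64.cc is the arm64 counterpart of the AArch32 assembly in compare_neon.cc, and the HAS_SUMSQUAREERROR_NEON guard in compare.cc now also fires for __aarch64__, so 64-bit ARM builds stop falling back to SumSquareError_C. A sketch of the runtime selection this enables, mirroring the SSE2 branch shown in compare.cc; the function-pointer scaffolding is illustrative, and count is assumed to be a multiple of 16 as the existing note in compare.cc says:

  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SumSquareError = SumSquareError_NEON;  // now also taken on __aarch64__
  }
#endif
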
diff --git a/libvpx/third_party/libyuv/source/compare_win.cc b/libvpx/third_party/libyuv/source/compare_win.cc
index 9983165..19806f2 100644
--- a/libvpx/third_party/libyuv/source/compare_win.cc
+++ b/libvpx/third_party/libyuv/source/compare_win.cc
@@ -16,9 +16,11 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
@@ -27,13 +29,11 @@
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
-    align      4
   wloop:
-    movdqa     xmm1, [eax]
+    movdqu     xmm1, [eax]
     lea        eax,  [eax + 16]
-    movdqa     xmm2, [edx]
+    movdqu     xmm2, [edx]
     lea        edx,  [edx + 16]
-    sub        ecx, 16
     movdqa     xmm3, xmm1  // abs trick
     psubusb    xmm1, xmm2
     psubusb    xmm2, xmm3
@@ -45,6 +45,7 @@
     pmaddwd    xmm2, xmm2
     paddd      xmm0, xmm1
     paddd      xmm0, xmm2
+    sub        ecx, 16
     jg         wloop
 
     pshufd     xmm1, xmm0, 0xee
@@ -60,7 +61,7 @@
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
@@ -70,12 +71,10 @@
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
 
-    align      4
   wloop:
     vmovdqu    ymm1, [eax]
     vmovdqu    ymm2, [eax + edx]
     lea        eax,  [eax + 32]
-    sub        ecx, 32
     vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
     vpsubusb   ymm2, ymm2, ymm1
     vpor       ymm1, ymm2, ymm3
@@ -85,6 +84,7 @@
     vpmaddwd   ymm1, ymm1, ymm1
     vpaddd     ymm0, ymm0, ymm1
     vpaddd     ymm0, ymm0, ymm2
+    sub        ecx, 32
     jg         wloop
 
     vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
@@ -135,7 +135,7 @@
 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
     _asm _emit 0x40 _asm _emit reg
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
@@ -145,7 +145,6 @@
     pxor       xmm7, xmm7        // constant 0 for unpck
     movdqa     xmm6, kHash16x33
 
-    align      4
   wloop:
     movdqu     xmm1, [eax]       // src[0-15]
     lea        eax, [eax + 16]
@@ -170,7 +169,6 @@
     pmulld(0xcd)                 // pmulld     xmm1, xmm5
     paddd      xmm3, xmm4        // add 16 results
     paddd      xmm1, xmm2
-    sub        ecx, 16
     paddd      xmm1, xmm3
 
     pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
@@ -178,6 +176,7 @@
     pshufd     xmm2, xmm1, 0x01
     paddd      xmm1, xmm2
     paddd      xmm0, xmm1
+    sub        ecx, 16
     jg         wloop
 
     movd       eax, xmm0         // return hash
@@ -187,7 +186,7 @@
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
   __asm {
     mov        eax, [esp + 4]    // src
@@ -195,7 +194,6 @@
     movd       xmm0, [esp + 12]  // seed
     movdqa     xmm6, kHash16x33
 
-    align      4
   wloop:
     vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
     pmulld     xmm0, xmm6  // hash *= 33 ^ 16
@@ -209,13 +207,13 @@
     pmulld     xmm1, kHashMul3
     paddd      xmm3, xmm4        // add 16 results
     paddd      xmm1, xmm2
-    sub        ecx, 16
     paddd      xmm1, xmm3
     pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
     paddd      xmm1, xmm2
     pshufd     xmm2, xmm1, 0x01
     paddd      xmm1, xmm2
     paddd      xmm0, xmm1
+    sub        ecx, 16
     jg         wloop
 
     movd       eax, xmm0         // return hash
@@ -223,8 +221,7 @@
   }
 }
 #endif  // _MSC_VER >= 1700
-
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
 #ifdef __cplusplus
 }  // extern "C"
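
The recurring pattern in compare_win.cc and compare_gcc.cc is the switch from movdqa to movdqu, along with dropping the align directives and __declspec(align(16)), which removes the 16-byte alignment requirement from the SIMD kernels. That is why the caller in compare.cc, and the dispatchers in convert.cc below, can drop their IS_ALIGNED pointer checks. The before/after of the SumSquareError selection, taken from the compare.cc hunk above:

  // Before this pull: SSE2 only when both sources were 16-byte aligned.
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
    SumSquareError = SumSquareError_SSE2;
  }

  // After: unaligned loads (movdqu) make the pointer test unnecessary.
  if (TestCpuFlag(kCpuHasSSE2)) {
    SumSquareError = SumSquareError_SSE2;
  }
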
diff --git a/libvpx/third_party/libyuv/source/convert.cc b/libvpx/third_party/libyuv/source/convert.cc
index 874a6cb..3ad6bd7 100644
--- a/libvpx/third_party/libyuv/source/convert.cc
+++ b/libvpx/third_party/libyuv/source/convert.cc
@@ -188,17 +188,14 @@
                        int width, int height) {
   int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src, 16) &&
-      IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -207,8 +204,8 @@
   }
 #endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
 #if defined(HAS_COPYROW_MIPS)
@@ -283,20 +280,15 @@
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_SPLITUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     SplitUVRow = SplitUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
-          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
-          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
-        SplitUVRow = SplitUVRow_SSE2;
-      }
+      SplitUVRow = SplitUVRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_SPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     SplitUVRow = SplitUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       SplitUVRow = SplitUVRow_AVX2;
@@ -304,7 +296,7 @@
   }
 #endif
 #if defined(HAS_SPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     SplitUVRow = SplitUVRow_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       SplitUVRow = SplitUVRow_NEON;
@@ -312,15 +304,13 @@
   }
 #endif
 #if defined(HAS_SPLITUVROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
     SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
-      if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
-          IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-          IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-        SplitUVRow = SplitUVRow_MIPS_DSPR2;
-      }
+      SplitUVRow = SplitUVRow_MIPS_DSPR2;
     }
   }
 #endif
@@ -391,125 +381,6 @@
                     width, height);
 }
 
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  int halfheight = (height + 1) >> 1;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
-      int pix) = YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
-      YUY2ToYRow_C;
-  if (!src_y || !src_yuy2 ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-  // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    CopyRow = CopyRow_SSE2;
-  }
-#endif
-#if defined(HAS_COPYROW_ERMS)
-  if (TestCpuFlag(kCpuHasERMS)) {
-    CopyRow = CopyRow_ERMS;
-  }
-#endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
-
-#if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
-    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
-      YUY2ToYRow = YUY2ToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      YUY2ToYRow = YUY2ToYRow_NEON;
-      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height - 1; y += 2) {
-    CopyRow(src_y, dst_y, width);
-    src_y += src_stride_y;
-    dst_y += dst_stride_y;
-
-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
-    YUY2ToYRow(src_yuy2, dst_y, width);
-    src_yuy2 += src_stride_yuy2;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    CopyRow(src_y, dst_y, width);
-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
-  }
-  return 0;
-}
-
 // Convert YUY2 to I420.
 LIBYUV_API
 int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
@@ -529,23 +400,17 @@
     src_stride_yuy2 = -src_stride_yuy2;
   }
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
     YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUVRow = YUY2ToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUVRow = YUY2ToUVRow_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -555,11 +420,9 @@
   }
 #endif
 #if defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
-    }
+    YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUVRow = YUY2ToUVRow_NEON;
@@ -602,23 +465,17 @@
     src_stride_uyvy = -src_stride_uyvy;
   }
 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     UYVYToUVRow = UYVYToUVRow_Any_SSE2;
     UYVYToYRow = UYVYToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUVRow = UYVYToUVRow_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     UYVYToUVRow = UYVYToUVRow_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -628,11 +485,9 @@
   }
 #endif
 #if defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_NEON;
-    }
+    UYVYToUVRow = UYVYToUVRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUVRow = UYVYToUVRow_NEON;
@@ -680,23 +535,17 @@
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -706,16 +555,18 @@
   }
 #endif
 #if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -759,34 +610,31 @@
     src_bgra = src_bgra + (height - 1) * src_stride_bgra;
     src_stride_bgra = -src_stride_bgra;
   }
-#if defined(HAS_BGRATOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
     BGRAToYRow = BGRAToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
-      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
-        BGRAToUVRow = BGRAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          BGRAToYRow = BGRAToYRow_SSSE3;
-        }
-      }
+      BGRAToUVRow = BGRAToUVRow_SSSE3;
+      BGRAToYRow = BGRAToYRow_SSSE3;
     }
   }
-#elif defined(HAS_BGRATOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     BGRAToYRow = BGRAToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       BGRAToYRow = BGRAToYRow_NEON;
     }
-    if (width >= 16) {
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
+    }
+  }
-  }
 #endif
 
   for (y = 0; y < height - 1; y += 2) {
@@ -828,32 +676,29 @@
     src_abgr = src_abgr + (height - 1) * src_stride_abgr;
     src_stride_abgr = -src_stride_abgr;
   }
-#if defined(HAS_ABGRTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
     ABGRToYRow = ABGRToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
-      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
-        ABGRToUVRow = ABGRToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ABGRToYRow = ABGRToYRow_SSSE3;
-        }
-      }
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ABGRTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ABGRToYRow = ABGRToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ABGRToYRow = ABGRToYRow_NEON;
     }
-    if (width >= 16) {
-      ABGRToUVRow = ABGRToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ABGRToUVRow = ABGRToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
     }
   }
 #endif
@@ -897,32 +742,29 @@
     src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
-#if defined(HAS_RGBATOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
     RGBAToYRow = RGBAToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
-      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
-        RGBAToUVRow = RGBAToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          RGBAToYRow = RGBAToYRow_SSSE3;
-        }
-      }
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+      RGBAToYRow = RGBAToYRow_SSSE3;
     }
   }
-#elif defined(HAS_RGBATOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     RGBAToYRow = RGBAToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGBAToYRow = RGBAToYRow_NEON;
     }
-    if (width >= 16) {
-      RGBAToUVRow = RGBAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        RGBAToUVRow = RGBAToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON;
     }
   }
 #endif
@@ -963,9 +805,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -978,79 +817,84 @@
     src_stride_rgb24 = -src_stride_rgb24;
   }
 
+// Neon version does direct RGB24 to YUV.
 #if defined(HAS_RGB24TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
     RGB24ToYRow = RGB24ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGB24ToYRow = RGB24ToYRow_NEON;
-    }
-    if (width >= 16) {
-      RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         RGB24ToUVRow = RGB24ToUVRow_NEON;
       }
     }
   }
-#else  // HAS_RGB24TOYROW_NEON
-
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_RGB24TOYROW_NEON
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
 
-  for (y = 0; y < height - 1; y += 2) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
 #else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
 #else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_RGB24TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
+  }
 #endif
   return 0;
 }
@@ -1075,9 +919,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_raw || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1090,79 +931,84 @@
     src_stride_raw = -src_stride_raw;
   }
 
+// Neon version does direct RAW to YUV.
 #if defined(HAS_RAWTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
     RAWToYRow = RAWToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RAWToYRow = RAWToYRow_NEON;
-    }
-    if (width >= 16) {
-      RAWToUVRow = RAWToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         RAWToUVRow = RAWToUVRow_NEON;
       }
     }
   }
-#else  // HAS_RAWTOYROW_NEON
-
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RAWToARGBRow = RAWToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_RAWTOYROW_NEON
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
 
-  for (y = 0; y < height - 1; y += 2) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
 #else
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_raw += src_stride_raw * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
 #else
-    RAWToARGBRow(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_RAWTOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
+  }
 #endif
   return 0;
 }
@@ -1187,9 +1033,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1202,79 +1045,92 @@
     src_stride_rgb565 = -src_stride_rgb565;
   }
 
+// Neon version does direct RGB565 to YUV.
 #if defined(HAS_RGB565TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
     RGB565ToYRow = RGB565ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGB565ToYRow = RGB565ToYRow_NEON;
-    }
-    if (width >= 16) {
-      RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         RGB565ToUVRow = RGB565ToUVRow_NEON;
       }
     }
   }
-#else  // HAS_RGB565TOYROW_NEON
-
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_RGB565TOYROW_NEON
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
 
-  for (y = 0; y < height - 1; y += 2) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
 #else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
 #else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_RGB565TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
+  }
 #endif
   return 0;
 }
@@ -1299,9 +1155,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1314,81 +1167,94 @@
     src_stride_argb1555 = -src_stride_argb1555;
   }
 
+// Neon version does direct ARGB1555 to YUV.
 #if defined(HAS_ARGB1555TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
     ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGB1555ToYRow = ARGB1555ToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
       }
     }
   }
-#else  // HAS_ARGB1555TOYROW_NEON
-
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_ARGB1555TOYROW_NEON
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
 
-  for (y = 0; y < height - 1; y += 2) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                   width);
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_ARGB1555TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
+  }
 #endif
   return 0;
 }
@@ -1413,9 +1279,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1428,81 +1291,94 @@
     src_stride_argb4444 = -src_stride_argb4444;
   }
 
+// Neon version does direct ARGB4444 to YUV.
 #if defined(HAS_ARGB4444TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
     ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGB4444ToYRow = ARGB4444ToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
       }
     }
   }
-#else  // HAS_ARGB4444TOYROW_NEON
-
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#endif  // HAS_ARGBTOUVROW_SSSE3
-#endif  // HAS_ARGB4444TOYROW_NEON
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
 
-  for (y = 0; y < height - 1; y += 2) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                   width);
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
 #else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
 #endif
-  }
+    }
 #if !defined(HAS_ARGB4444TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    free_aligned_buffer_64(row);
+  }
 #endif
   return 0;
 }
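
The hunks above all converge on the same dispatch shape. As a minimal sketch (not part of the patch), assuming the usual libyuv context -- a local `width` plus the row-function symbols declared in libyuv/row.h -- the selection now looks like this; the _Any_ wrapper copes with remainder pixels, which is why the old "width >= N" guards, destination-alignment tests, and Unaligned_ variants could be dropped:

  #include "libyuv/cpu_id.h"  /* TestCpuFlag, kCpuHasSSSE3, kCpuHasAVX2 */
  #include "libyuv/row.h"     /* IS_ALIGNED and the ARGBToYRow_* declarations */

  /* Return the widest usable ARGBToYRow implementation for this width.
   * Exact SIMD versions run only on whole 16/32-pixel blocks; the _Any_
   * wrappers handle any width, so no separate alignment checks remain. */
  static void (*PickARGBToYRow(int width))(const uint8*, uint8*, int) {
    void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C;
  #if defined(HAS_ARGBTOYROW_SSSE3)
    if (TestCpuFlag(kCpuHasSSSE3)) {
      ARGBToYRow = ARGBToYRow_Any_SSSE3;   /* any width */
      if (IS_ALIGNED(width, 16)) {
        ARGBToYRow = ARGBToYRow_SSSE3;     /* whole 16-pixel blocks */
      }
    }
  #endif
  #if defined(HAS_ARGBTOYROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      ARGBToYRow = ARGBToYRow_Any_AVX2;    /* any width */
      if (IS_ALIGNED(width, 32)) {
        ARGBToYRow = ARGBToYRow_AVX2;      /* whole 32-pixel blocks */
      }
    }
  #endif
    return ARGBToYRow;
  }
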
diff --git a/libvpx/third_party/libyuv/source/convert_argb.cc b/libvpx/third_party/libyuv/source/convert_argb.cc
index ac0bc3d..44756bc 100644
--- a/libvpx/third_party/libyuv/source/convert_argb.cc
+++ b/libvpx/third_party/libyuv/source/convert_argb.cc
@@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"
 
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
@@ -79,17 +78,23 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
 #if defined(HAS_I444TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I444ToARGBRow = I444ToARGBRow_SSSE3;
-      }
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_I444TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I444ToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I444ToARGBRow = I444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I444ToARGBRow = I444ToARGBRow_NEON;
@@ -141,18 +146,15 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
 #if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     I422ToARGBRow = I422ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -160,7 +162,7 @@
   }
 #endif
 #if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
@@ -221,17 +223,23 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
 #if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I411ToARGBRow = I411ToARGBRow_SSSE3;
-      }
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_I411TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I411ToARGBRow = I411ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I411ToARGBRow = I411ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I411ToARGBRow = I411ToARGBRow_NEON;
@@ -251,13 +259,13 @@
 
 // Convert I400 to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
-                         uint8* dst_argb, int dst_stride_argb,
-                         int width, int height) {
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   int y;
-  void (*YToARGBRow)(const uint8* y_buf,
+  void (*I400ToARGBRow)(const uint8* y_buf,
                      uint8* rgb_buf,
-                     int width) = YToARGBRow_C;
+                     int width) = I400ToARGBRow_C;
   if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -275,39 +283,47 @@
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
-#if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_Any_SSE2;
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
-      YToARGBRow = YToARGBRow_SSE2;
+      I400ToARGBRow = I400ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_YTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    YToARGBRow = YToARGBRow_Any_NEON;
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I400ToARGBRow = I400ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      YToARGBRow = YToARGBRow_NEON;
+      I400ToARGBRow = I400ToARGBRow_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    YToARGBRow(src_y, dst_argb, width);
+    I400ToARGBRow(src_y, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
   }
   return 0;
 }
 
-// Convert I400 to ARGB.
+// Convert J400 to ARGB.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
+int J400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
   int y;
-  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
-      I400ToARGBRow_C;
+  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+      J400ToARGBRow_C;
   if (!src_y || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
@@ -325,26 +341,32 @@
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
-#if defined(HAS_I400TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
-    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I400ToARGBRow = I400ToARGBRow_SSE2;
-      }
+      J400ToARGBRow = J400ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_I400TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    I400ToARGBRow = I400ToARGBRow_Any_NEON;
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = J400ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I400ToARGBRow = I400ToARGBRow_NEON;
+      J400ToARGBRow = J400ToARGBRow_NEON;
     }
   }
 #endif
   for (y = 0; y < height; ++y) {
-    I400ToARGBRow(src_y, dst_argb, width);
+    J400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
     dst_argb += dst_stride_argb;
   }
@@ -447,15 +469,15 @@
     src_stride_rgb24 = dst_stride_argb = 0;
   }
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RGB24TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGB24ToARGBRow = RGB24ToARGBRow_NEON;
@@ -497,15 +519,15 @@
     src_stride_raw = dst_stride_argb = 0;
   }
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       RAWToARGBRow = RAWToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RAWTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     RAWToARGBRow = RAWToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RAWToARGBRow = RAWToARGBRow_NEON;
@@ -547,15 +569,23 @@
     src_stride_rgb565 = dst_stride_argb = 0;
   }
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_RGB565TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       RGB565ToARGBRow = RGB565ToARGBRow_NEON;
@@ -597,15 +627,23 @@
     src_stride_argb1555 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
@@ -647,15 +685,23 @@
     src_stride_argb4444 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
     if (IS_ALIGNED(width, 8)) {
       ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
     }
   }
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
@@ -693,17 +739,23 @@
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -744,18 +796,23 @@
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_NV21TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
-      }
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
     }
   }
 #endif
 #if defined(HAS_NV21TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV21ToARGBRow = NV21ToARGBRow_NEON;
@@ -795,17 +852,23 @@
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_NV12TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
-      }
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_NV12TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -852,19 +915,23 @@
     src_stride_yuy2 = dst_stride_argb = 0;
   }
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
-  // Posix is 16, Windows is 8.
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
-      }
+      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_YUY2TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       YUY2ToARGBRow = YUY2ToARGBRow_NEON;
@@ -905,19 +972,23 @@
     src_stride_uyvy = dst_stride_argb = 0;
   }
 #if defined(HAS_UYVYTOARGBROW_SSSE3)
-  // Posix is 16, Windows is 8.
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        UYVYToARGBRow = UYVYToARGBRow_SSSE3;
-      }
+      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_UYVYTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToARGBRow = UYVYToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       UYVYToARGBRow = UYVYToARGBRow_NEON;
@@ -932,6 +1003,152 @@
   return 0;
 }
 
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*J422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = J422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J422ToARGBRow = J422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*J422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = J422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J422ToARGBRow = J422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
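
One detail worth noting in the new J420ToARGB above: it advances the chroma planes only on odd rows, because in 4:2:0 one U/V row covers two luma rows (J422ToARGB, by contrast, advances U/V every row). A small illustrative helper, with hypothetical names, assuming only stdint.h:

  #include <stdint.h>

  /* 4:2:0 plane stepping as used by J420ToARGB: Y advances every row, U/V
   * advance only after the second row of each pair (when row is odd). */
  static void StepPlanes420(const uint8_t** y, const uint8_t** u, const uint8_t** v,
                            int stride_y, int stride_u, int stride_v, int row) {
    *y += stride_y;
    if (row & 1) {
      *u += stride_u;
      *v += stride_v;
    }
  }
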
diff --git a/libvpx/third_party/libyuv/source/convert_from.cc b/libvpx/third_party/libyuv/source/convert_from.cc
index c1a2f62..31f1ac9 100644
--- a/libvpx/third_party/libyuv/source/convert_from.cc
+++ b/libvpx/third_party/libyuv/source/convert_from.cc
@@ -13,7 +13,6 @@
 #include "libyuv/basic_types.h"
 #include "libyuv/convert.h"  // For I420Copy
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/scale.h"  // For ScalePlane()
@@ -174,14 +173,15 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
   }
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -220,14 +220,15 @@
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -280,14 +281,15 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
   }
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -326,14 +328,15 @@
     dst_stride_uyvy = -dst_stride_uyvy;
   }
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -397,20 +400,15 @@
     src_stride_u = src_stride_v = dst_stride_uv = 0;
   }
 #if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
-          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
-          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       MergeUVRow_ = MergeUVRow_AVX2;
@@ -418,7 +416,7 @@
   }
 #endif
 #if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_NEON;
@@ -476,18 +474,15 @@
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     I422ToARGBRow = I422ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -495,7 +490,7 @@
   }
 #endif
 #if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
@@ -548,23 +543,30 @@
     dst_stride_bgra = -dst_stride_bgra;
   }
 #if defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToBGRARow = I422ToBGRARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToBGRARow = I422ToBGRARow_NEON;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -610,17 +612,23 @@
     dst_stride_abgr = -dst_stride_abgr;
   }
 #if defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToABGRRow = I422ToABGRRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToABGRRow = I422ToABGRRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToABGRRow = I422ToABGRRow_NEON;
@@ -664,17 +672,23 @@
     dst_stride_rgba = -dst_stride_rgba;
   }
 #if defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRGBARow = I422ToRGBARow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToRGBARow = I422ToRGBARow_NEON;
@@ -718,14 +732,23 @@
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_I422TORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       I422ToRGB24Row = I422ToRGB24Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToRGB24Row = I422ToRGB24Row_NEON;
@@ -769,14 +792,23 @@
     dst_stride_raw = -dst_stride_raw;
   }
 #if defined(HAS_I422TORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       I422ToRAWRow = I422ToRAWRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORAWROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRAWRow = I422ToRAWRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRAWRow = I422ToRAWRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRAWRow = I422ToRAWRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToRAWRow = I422ToRAWRow_NEON;
@@ -820,14 +852,23 @@
     dst_stride_argb1555 = -dst_stride_argb1555;
   }
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGB1555Row = I422ToARGB1555Row_NEON;
@@ -872,14 +913,23 @@
     dst_stride_argb4444 = -dst_stride_argb4444;
   }
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGB4444Row = I422ToARGB4444Row_NEON;
@@ -923,14 +973,23 @@
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_I422TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       I422ToRGB565Row = I422ToRGB565Row_SSSE3;
     }
   }
-#elif defined(HAS_I422TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToRGB565Row = I422ToRGB565Row_NEON;
@@ -950,6 +1009,117 @@
   return 0;
 }
 
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate a row of argb.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
+  return 0;
+}
+
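For orientation, here is a minimal calling sketch for the new converter. The frame size, plane buffers, and include path are assumptions for illustration only; passing NULL for dither4x4 selects the built-in kDither565_4x4 table shown above.

  /* Hypothetical caller; assumes the declaration is exported from
   * libyuv/convert_from.h like the other I420To* converters. */
  #include "libyuv/convert_from.h"

  enum { kW = 64, kH = 48 };
  static uint8 y_plane[kW * kH];
  static uint8 u_plane[(kW / 2) * (kH / 2)];
  static uint8 v_plane[(kW / 2) * (kH / 2)];
  static uint8 rgb565[kW * kH * 2];

  int ConvertExample(void) {
    /* NULL dither table: kDither565_4x4 is used internally. */
    return I420ToRGB565Dither(y_plane, kW,
                              u_plane, kW / 2,
                              v_plane, kW / 2,
                              rgb565, kW * 2,
                              NULL, kW, kH);
  }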
 // Convert I420 to specified format
 LIBYUV_API
 int ConvertFromI420(const uint8* y, int y_stride,
@@ -1054,38 +1224,6 @@
                      dst_sample_stride ? dst_sample_stride : width * 4,
                      width, height);
       break;
-    case FOURCC_BGGR:
-      r = I420ToBayerBGGR(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_GBRG:
-      r = I420ToBayerGBRG(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_GRBG:
-      r = I420ToBayerGRBG(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
-    case FOURCC_RGGB:
-      r = I420ToBayerRGGB(y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          dst_sample,
-                          dst_sample_stride ? dst_sample_stride : width,
-                          width, height);
-      break;
     case FOURCC_I400:
       r = I400Copy(y, y_stride,
                    dst_sample,
@@ -1116,7 +1254,7 @@
                      width, height);
       break;
     }
-    // TODO(fbarchard): Add M420 and Q420.
+    // TODO(fbarchard): Add M420.
     // Triplanar formats
     // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
diff --git a/libvpx/third_party/libyuv/source/convert_from_argb.cc b/libvpx/third_party/libyuv/source/convert_from_argb.cc
index 121a416..8d1e97a 100644
--- a/libvpx/third_party/libyuv/source/convert_from_argb.cc
+++ b/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -12,7 +12,6 @@
 
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/row.h"
 
@@ -51,38 +50,45 @@
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    if (TestCpuFlag(kCpuHasSSSE3)) {
       ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
       if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
-        if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-          ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-        }
+        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
       }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
-    }
-  }
-
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
       ARGBToUV444Row = ARGBToUV444Row_NEON;
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -125,40 +131,43 @@
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
     }
   }
 #endif
-
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -203,19 +212,15 @@
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -223,16 +228,18 @@
   }
 #endif
 #if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 32) {
-      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-      if (IS_ALIGNED(width, 32)) {
-        ARGBToUV411Row = ARGBToUV411Row_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
     }
   }
 #endif
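The hunks in this file repeatedly apply the same two-step dispatch that replaces the old width >= N guards: when the CPU flag is set, install the _Any_ variant (which handles arbitrary widths by finishing the tail in C), then upgrade to the full SIMD kernel only when the width is an exact multiple of its step. A generic sketch of that idiom, with made-up names rather than libyuv internals:

  typedef void (*RowFn)(const unsigned char* src, unsigned char* dst, int width);

  /* Illustrative only: c_fn / any_fn / simd_fn stand in for triples such as
   * ARGBToYRow_C / ARGBToYRow_Any_SSSE3 / ARGBToYRow_SSSE3. */
  static RowFn SelectRowFn(int width, int has_simd,
                           RowFn c_fn, RowFn any_fn, RowFn simd_fn, int step) {
    RowFn fn = c_fn;            /* Portable fallback. */
    if (has_simd) {
      fn = any_fn;              /* Any width: SIMD body, C tail. */
      if (width % step == 0) {
        fn = simd_fn;           /* Exact multiples: pure SIMD kernel. */
      }
    }
    return fn;
  }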
@@ -261,9 +268,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -276,47 +280,51 @@
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       MergeUVRow_ = MergeUVRow_AVX2;
@@ -324,29 +332,34 @@
   }
 #endif
 #if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_NEON;
     }
   }
 #endif
+  {
+    // Allocate a row of u and a row of v.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
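The replacement buffer rounds halfwidth up to a multiple of 32 (the old code used 16), presumably so the wider AVX2 row kernels enabled above can write a full vector at the end of the U half without clobbering the V half; that motivation is inferred from the surrounding hunks rather than stated in the patch. A quick check of the rounding expression:

  /* (halfwidth + 31) & ~31 rounds up to the next multiple of 32:
   *   halfwidth = 33  ->  (33 + 31) & ~31 = 64
   *   halfwidth = 64  ->  (64 + 31) & ~31 = 64
   * The single rounded * 2 allocation holds the U row followed by the
   * V row, so row_v = row_u + rounded. */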
 
@@ -364,9 +377,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -379,47 +389,51 @@
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
     if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
-        MergeUVRow_ = MergeUVRow_SSE2;
-      }
+      MergeUVRow_ = MergeUVRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       MergeUVRow_ = MergeUVRow_AVX2;
@@ -427,29 +441,34 @@
   }
 #endif
 #if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
     if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_NEON;
     }
   }
 #endif
+  {
+    // Allocate a row of u and a row of v.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
 
@@ -484,50 +503,56 @@
     src_stride_argb = dst_stride_yuy2 = 0;
   }
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
     }
   }
-#elif defined(HAS_I422TOYUY2ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -585,50 +610,56 @@
     src_stride_argb = dst_stride_uyvy = 0;
   }
 #if defined(HAS_ARGBTOUV422ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
-      }
+      ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
     }
   }
-#elif defined(HAS_I422TOUYVYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -679,19 +710,15 @@
     src_stride_argb = dst_stride_y = 0;
   }
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
+      ARGBToYRow = ARGBToYRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -699,7 +726,7 @@
   }
 #endif
 #if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
@@ -755,14 +782,15 @@
     src_stride_argb = dst_stride_rgb24 = 0;
   }
 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToRGB24Row = ARGBToRGB24Row_NEON;
@@ -802,14 +830,15 @@
     src_stride_argb = dst_stride_raw = 0;
   }
 #if defined(HAS_ARGBTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToRAWRow = ARGBToRAWRow_SSSE3;
     }
   }
-#elif defined(HAS_ARGBTORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToRAWRow = ARGBToRAWRow_NEON;
@@ -825,7 +854,68 @@
   return 0;
 }
 
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
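Both dither converters hand the row kernel one row of the 4x4 table per output line: ((y & 3) << 2) selects table row 0..3, and its four bytes are reinterpreted as a single 32-bit word. The sketch below shows that selection; the per-pixel application described in the trailing comment is how the C row kernel is expected to behave, inferred rather than quoted from this patch.

  #include <stdint.h>
  #include <string.h>

  static const uint8_t kDither[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
  };

  /* Image row y uses table row (y & 3); its 4 bytes become one 32-bit word,
   * matching *(uint32*)(dither4x4 + ((y & 3) << 2)) above. */
  uint32_t DitherWordForRow(int y) {
    uint32_t d;
    memcpy(&d, kDither + ((y & 3) << 2), 4);
    return d;
  }
  /* Pixel x then typically adds byte (x & 3) of that word to each 8-bit
   * channel (clamped to 255) before truncating to 5/6/5 bits. */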
 // Convert ARGB To RGB565.
+// TODO(fbarchard): Consider reusing the low level dither function with a zero dither value.
 LIBYUV_API
 int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_rgb565, int dst_stride_rgb565,
@@ -849,15 +939,23 @@
     src_stride_argb = dst_stride_rgb565 = 0;
   }
 #if defined(HAS_ARGBTORGB565ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToRGB565Row = ARGBToRGB565Row_NEON;
@@ -897,15 +995,23 @@
     src_stride_argb = dst_stride_argb1555 = 0;
   }
 #if defined(HAS_ARGBTOARGB1555ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
@@ -945,15 +1051,23 @@
     src_stride_argb = dst_stride_argb4444 = 0;
   }
 #if defined(HAS_ARGBTOARGB4444ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
     }
   }
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
@@ -978,7 +1092,7 @@
                int width, int height) {
   int y;
   void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
       ARGBToYJRow_C;
   if (!src_argb ||
@@ -993,23 +1107,17 @@
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-        if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-          ARGBToYJRow = ARGBToYJRow_SSSE3;
-        }
-      }
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToYJRow = ARGBToYJRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1017,16 +1125,18 @@
   }
 #endif
 #if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYJRow = ARGBToYJRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
     }
   }
 #endif
@@ -1047,6 +1157,88 @@
   return 0;
 }
 
+// ARGB little endian (bgra in memory) to J422
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUVJ422Row_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYJRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
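J422 here means full-range (JPEG) YUV with 4:2:2 sampling: the Y plane is width x height, and U and V are subsampled horizontally only, so each is ((width + 1) / 2) x height. A hypothetical allocation and call, with sizes chosen for illustration rather than taken from the patch:

  #include <stdlib.h>

  int ArgbToJ422Example(const uint8* src_argb, int width, int height) {
    int halfwidth = (width + 1) / 2;
    /* Error handling omitted for brevity. */
    uint8* dst_y = (uint8*)malloc((size_t)width * height);
    uint8* dst_u = (uint8*)malloc((size_t)halfwidth * height);
    uint8* dst_v = (uint8*)malloc((size_t)halfwidth * height);
    int r = ARGBToJ422(src_argb, width * 4,
                       dst_y, width,
                       dst_u, halfwidth,
                       dst_v, halfwidth,
                       width, height);
    free(dst_y);
    free(dst_u);
    free(dst_v);
    return r;
  }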
 // Convert ARGB to J400.
 LIBYUV_API
 int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
@@ -1071,19 +1263,15 @@
     src_stride_argb = dst_stride_yj = 0;
   }
 #if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
-        ARGBToYJRow = ARGBToYJRow_SSSE3;
-      }
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToYJRow = ARGBToYJRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1091,7 +1279,7 @@
   }
 #endif
 #if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYJRow = ARGBToYJRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
diff --git a/libvpx/third_party/libyuv/source/convert_to_argb.cc b/libvpx/third_party/libyuv/source/convert_to_argb.cc
index 1b228a7..af829fb 100644
--- a/libvpx/third_party/libyuv/source/convert_to_argb.cc
+++ b/libvpx/third_party/libyuv/source/convert_to_argb.cc
@@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"
 
 #include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
 #ifdef HAVE_JPEG
 #include "libyuv/mjpeg_decoder.h"
 #endif
@@ -144,36 +143,6 @@
                          crop_argb, argb_stride,
                          crop_width, inv_crop_height);
       break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToARGB(src, src_width,
-                          crop_argb, argb_stride,
-                          crop_width, inv_crop_height);
-      break;
-
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
       r = I400ToARGB(src, src_width,
@@ -205,15 +174,6 @@
                      crop_argb, argb_stride,
                      crop_width, inv_crop_height);
       break;
-//    case FOURCC_Q420:
-//      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-//      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-//               src_width + crop_x * 2;
-//      r = Q420ToARGB(src, src_width * 3,
-//                    src_uv, src_width * 3,
-//                    crop_argb, argb_stride,
-//                    crop_width, inv_crop_height);
-//      break;
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YU12:
@@ -241,6 +201,25 @@
                      crop_width, inv_crop_height);
       break;
     }
+
+    case FOURCC_J420: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      src_u = sample + src_width * abs_src_height +
+          (halfwidth * crop_y + crop_x) / 2;
+      src_v = sample + src_width * abs_src_height +
+          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
     case FOURCC_I422:
     case FOURCC_YV16: {
       const uint8* src_y = sample + src_width * crop_y + crop_x;
diff --git a/libvpx/third_party/libyuv/source/convert_to_i420.cc b/libvpx/third_party/libyuv/source/convert_to_i420.cc
index 7b194ff..5e75369 100644
--- a/libvpx/third_party/libyuv/source/convert_to_i420.cc
+++ b/libvpx/third_party/libyuv/source/convert_to_i420.cc
@@ -12,7 +12,6 @@
 
 #include "libyuv/convert.h"
 
-#include "libyuv/format_conversion.h"
 #include "libyuv/video_common.h"
 
 #ifdef __cplusplus
@@ -173,40 +172,6 @@
                      v, v_stride,
                      crop_width, inv_crop_height);
       break;
-    // TODO(fbarchard): Support cropping Bayer by odd numbers
-    // by adjusting fourcc.
-    case FOURCC_BGGR:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerBGGRToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_GBRG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGBRGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_GRBG:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerGRBGToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
-    case FOURCC_RGGB:
-      src = sample + (src_width * crop_y + crop_x);
-      r = BayerRGGBToI420(src, src_width,
-                          y, y_stride,
-                          u, u_stride,
-                          v, v_stride,
-                          crop_width, inv_crop_height);
-      break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
       r = I400ToI420(src, src_width,
@@ -218,7 +183,8 @@
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
       r = NV12ToI420Rotate(src, src_width,
                            src_uv, aligned_src_width,
                            y, y_stride,
@@ -228,7 +194,8 @@
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv = sample + (src_width * src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
       // Call NV12 but with u and v parameters swapped.
       r = NV12ToI420Rotate(src, src_width,
                            src_uv, aligned_src_width,
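The corrected src_uv points into the interleaved UV plane, which begins after src_width * src_height bytes of Y and holds one U/V byte pair per 2x2 block with a stride of aligned_src_width: skip crop_y / 2 UV rows plus crop_x / 2 byte pairs. A worked example with assumed values, not taken from the patch:

  /* Assume src_width = 640, src_height = 480, aligned_src_width = 640,
   * crop_x = 64, crop_y = 48 (even, as cropping expects):
   *   Y plane bytes:       640 * 480       = 307200
   *   skip 24 UV rows:     (48 / 2) * 640  =  15360
   *   skip 32 UV pairs:    (64 / 2) * 2    =     64
   *   src_uv = sample + 307200 + 15360 + 64 = sample + 322624 */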
@@ -245,17 +212,6 @@
                      v, v_stride,
                      crop_width, inv_crop_height);
       break;
-    case FOURCC_Q420:
-      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-               src_width + crop_x * 2;
-      r = Q420ToI420(src, src_width * 3,
-                    src_uv, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
-      break;
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YU12:
diff --git a/libvpx/third_party/libyuv/source/cpu_id.cc b/libvpx/third_party/libyuv/source/cpu_id.cc
index 2e0d61d..8a10b00 100644
--- a/libvpx/third_party/libyuv/source/cpu_id.cc
+++ b/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -10,12 +10,12 @@
 
 #include "libyuv/cpu_id.h"
 
-#if defined(_MSC_VER) && !defined(__clang__)
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
@@ -36,19 +36,20 @@
 
 // For functions that use the stack and have runtime checks for overflow,
 // use SAFEBUFFERS to avoid additional check.
-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
 #define SAFEBUFFERS __declspec(safebuffers)
 #else
 #define SAFEBUFFERS
 #endif
 
-// Low level cpuid for X86. Returns zeros on other CPUs.
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__))
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER) && !defined(__clang__)
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
@@ -69,7 +70,8 @@
     cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
   }
 #endif
-#else  // defined(_MSC_VER)
+// GCC version uses inline x86 assembly.
+#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
   uint32 info_ebx, info_edx;
   asm volatile (  // NOLINT
 #if defined( __i386__) && defined(__PIC__)
@@ -87,17 +89,26 @@
   cpu_info[1] = info_ebx;
   cpu_info[2] = info_ecx;
   cpu_info[3] = info_edx;
-#endif  // defined(_MSC_VER)
+#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
 }
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
 
-#if !defined(__native_client__)
+// TODO(fbarchard): Enable xgetbv when validator supports it.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
 #define HAS_XGETBV
 // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
 int TestOsSaveYmm() {
   uint32 xcr0 = 0u;
-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
   __asm {
     xor        ecx, ecx    // xcr 0
     _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
@@ -105,16 +116,10 @@
   }
 #elif defined(__i386__) || defined(__x86_64__)
   asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
-#endif  // defined(_MSC_VER)
+#endif  // defined(__i386__) || defined(__x86_64__)
   return((xcr0 & 6) == 6);  // Is ymm saved?
 }
-#endif  // !defined(__native_client__)
-#else
-LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
-  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
-}
-#endif
+#endif  // defined(_M_IX86) || defined(_M_X64) ..
 
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
@@ -134,6 +139,12 @@
         fclose(f);
         return kCpuHasNEON;
       }
+      // aarch64 uses asimd for Neon.
+      p = strstr(cpuinfo_line, " asimd");
+      if (p && (p[6] == ' ' || p[6] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
     }
   }
   fclose(f);
@@ -239,7 +250,8 @@
   if (TestEnv("LIBYUV_DISABLE_FMA3")) {
     cpu_info_ &= ~kCpuHasFMA3;
   }
-#elif defined(__mips__) && defined(__linux__)
+#endif
+#if defined(__mips__) && defined(__linux__)
   // Linux mips parse text file for dsp detect.
   cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
 #if defined(__mips_dspr2)
@@ -256,12 +268,19 @@
   if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
     cpu_info_ &= ~kCpuHasMIPS_DSPR2;
   }
-#elif defined(__arm__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
   cpu_info_ = kCpuHasNEON;
+// For aarch64 (arm64), the /proc/cpuinfo feature list is incomplete, e.g. it
+// has no neon flag.
+// So for aarch64, Neon support is hard coded here.
+#endif
+#if defined(__aarch64__)
+  cpu_info_ = kCpuHasNEON;
 #else
   // Linux arm parse text file for neon detect.
   cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
diff --git a/libvpx/third_party/libyuv/source/format_conversion.cc b/libvpx/third_party/libyuv/source/format_conversion.cc
deleted file mode 100644
index a3daf96..0000000
--- a/libvpx/third_party/libyuv/source/format_conversion.cc
+++ /dev/null
@@ -1,552 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
-  return (uint32)(select0) |
-         (uint32)((select1 + 4) << 8) |
-         (uint32)((select0 + 8) << 16) |
-         (uint32)((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
-                         const int green_index,
-                         const int red_index,
-                         uint32 dst_fourcc_bayer,
-                         uint32* index_map) {
-  // Now build a lookup table containing the indices for the four pixels in each
-  // 2x2 Bayer grid.
-  switch (dst_fourcc_bayer) {
-    case FOURCC_BGGR:
-      index_map[0] = GenerateSelector(blue_index, green_index);
-      index_map[1] = GenerateSelector(green_index, red_index);
-      break;
-    case FOURCC_GBRG:
-      index_map[0] = GenerateSelector(green_index, blue_index);
-      index_map[1] = GenerateSelector(red_index, green_index);
-      break;
-    case FOURCC_RGGB:
-      index_map[0] = GenerateSelector(red_index, green_index);
-      index_map[1] = GenerateSelector(green_index, blue_index);
-      break;
-    case FOURCC_GRBG:
-      index_map[0] = GenerateSelector(green_index, red_index);
-      index_map[1] = GenerateSelector(blue_index, green_index);
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-  return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  int y;
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_NEON;
-    }
-  }
-#endif
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-
-  for (y = 0; y < height; ++y) {
-    ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
-    src_argb += src_stride_argb;
-    dst_bayer += dst_stride_bayer;
-  }
-  return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 r = src_bayer1[1];
-  int x;
-  for (x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer0[0];
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = AVG(r, src_bayer1[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    r = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer0[0];
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = AVG(r, src_bayer1[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[0];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer1[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 g = src_bayer0[1];
-  uint8 b = src_bayer1[1];
-  int x;
-  for (x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer1[1]);
-    dst_argb[1] = AVG(g, src_bayer0[1]);
-    dst_argb[2] = src_bayer0[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[7] = 255U;
-    g = src_bayer0[1];
-    b = src_bayer1[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer1[1]);
-  dst_argb[1] = AVG(g, src_bayer0[1]);
-  dst_argb[2] = src_bayer0[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[1];
-    dst_argb[5] = src_bayer0[1];
-    dst_argb[6] = src_bayer0[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 b = src_bayer0[1];
-  int x;
-  for (x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = AVG(b, src_bayer0[1]);
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = src_bayer1[0];
-    dst_argb[3] = 255U;
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[7] = 255U;
-    b = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = AVG(b, src_bayer0[1]);
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = src_bayer1[0];
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer0[1];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer1[0];
-    dst_argb[7] = 255U;
-  }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
-                       uint8* dst_argb, int pix) {
-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
-  uint8 r = src_bayer0[1];
-  int x;
-  for (x = 0; x < pix - 2; x += 2) {
-    dst_argb[0] = src_bayer1[0];
-    dst_argb[1] = src_bayer0[0];
-    dst_argb[2] = AVG(r, src_bayer0[1]);
-    dst_argb[3] = 255U;
-    dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-    r = src_bayer0[1];
-    src_bayer0 += 2;
-    src_bayer1 += 2;
-    dst_argb += 8;
-  }
-  dst_argb[0] = src_bayer1[0];
-  dst_argb[1] = src_bayer0[0];
-  dst_argb[2] = AVG(r, src_bayer0[1]);
-  dst_argb[3] = 255U;
-  if (!(pix & 1)) {
-    dst_argb[4] = src_bayer1[0];
-    dst_argb[5] = src_bayer0[0];
-    dst_argb[6] = src_bayer0[1];
-    dst_argb[7] = 255U;
-  }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  int y;
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;    // Bad FourCC
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-              dst_argb + dst_stride_argb, width);
-    src_bayer += src_stride_bayer * 2;
-    dst_argb += dst_stride_argb * 2;
-  }
-  if (height & 1) {
-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
-  }
-  return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height,
-                uint32 src_fourcc_bayer) {
-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
-                    uint8* dst_argb, int pix);
-
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-  // Negative height means invert the image.
-  if (height < 0) {
-    int halfheight;
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-        ARGBToYRow = ARGBToYRow_SSSE3;
-      }
-    }
-  }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
-    }
-  }
-#endif
-
-  switch (src_fourcc_bayer) {
-    case FOURCC_BGGR:
-      BayerRow0 = BayerRowBG;
-      BayerRow1 = BayerRowGR;
-      break;
-    case FOURCC_GBRG:
-      BayerRow0 = BayerRowGB;
-      BayerRow1 = BayerRowRG;
-      break;
-    case FOURCC_GRBG:
-      BayerRow0 = BayerRowGR;
-      BayerRow1 = BayerRowBG;
-      break;
-    case FOURCC_RGGB:
-      BayerRow0 = BayerRowRG;
-      BayerRow1 = BayerRowGB;
-      break;
-    default:
-      return -1;  // Bad FourCC
-  }
-
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 15) & ~15;
-    align_buffer_64(row, kRowSize * 2);
-    int y;
-    for (y = 0; y < height - 1; y += 2) {
-      BayerRow0(src_bayer, src_stride_bayer, row, width);
-      BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
-                row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-      src_bayer += src_stride_bayer * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-      BayerRow0(src_bayer, src_stride_bayer, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-    }
-    free_aligned_buffer_64(row);
-  }
-  return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_bayer, int dst_stride_bayer,
-                int width, int height,
-                uint32 dst_fourcc_bayer) {
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerRow_C;
-  const int blue_index = 0;  // Offsets for ARGB format
-  const int green_index = 1;
-  const int red_index = 2;
-  uint32 index_map[2];
-  // Negative height means invert the image.
-  if (height < 0) {
-    int halfheight;
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-#if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I422ToARGBRow = I422ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
-  }
-#endif
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_NEON;
-    }
-  }
-#endif
-
-  if (MakeSelectors(blue_index, green_index, red_index,
-                    dst_fourcc_bayer, index_map)) {
-    return -1;  // Bad FourCC
-  }
-  {
-    // Allocate a row of ARGB.
-    align_buffer_64(row, width * 4);
-    int y;
-    for (y = 0; y < height; ++y) {
-      I422ToARGBRow(src_y, src_u, src_v, row, width);
-      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
-      dst_bayer += dst_stride_bayer;
-      src_y += src_stride_y;
-      if (y & 1) {
-        src_u += src_stride_u;
-        src_v += src_stride_v;
-      }
-    }
-    free_aligned_buffer_64(row);
-  }
-  return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER)                                                 \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_y, int dst_stride_y,                       \
-                         uint8* dst_u, int dst_stride_u,                       \
-                         uint8* dst_v, int dst_stride_v,                       \
-                         int width, int height) {                              \
-  return BayerToI420(src_bayer, src_stride_bayer,                              \
-                     dst_y, dst_stride_y,                                      \
-                     dst_u, dst_stride_u,                                      \
-                     dst_v, dst_stride_v,                                      \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y,                   \
-                       const uint8* src_u, int src_stride_u,                   \
-                       const uint8* src_v, int src_stride_v,                   \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return I420ToBayer(src_y, src_stride_y,                                      \
-                     src_u, src_stride_u,                                      \
-                     src_v, src_stride_v,                                      \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb,             \
-                       uint8* dst_bayer, int dst_stride_bayer,                 \
-                       int width, int height) {                                \
-  return ARGBToBayer(src_argb, src_stride_argb,                                \
-                     dst_bayer, dst_stride_bayer,                              \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}                                                                              \
-                                                                               \
-LIBYUV_API                                                                     \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer,         \
-                         uint8* dst_argb, int dst_stride_argb,                 \
-                         int width, int height) {                              \
-  return BayerToARGB(src_bayer, src_stride_bayer,                              \
-                     dst_argb, dst_stride_argb,                                \
-                     width, height,                                            \
-                     FOURCC_##BAYER);                                          \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
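
For reference, the BayerRow helpers deleted above reconstruct the colour channels a pixel site does not sample by averaging its two nearest same-colour neighbours. A small standalone sketch of that interpolation for one GRBG tile pair, assuming AVG is the truncating average (((a) + (b)) >> 1) used by the deleted rows:

    #include <stdint.h>
    #include <stdio.h>

    #define AVG(a, b) (((a) + (b)) >> 1)  // assumed to match the deleted macro

    int main() {
      // One pair of Bayer rows as consumed by the deleted BayerRowGR:
      //   row0: G  R  G  R     row1: B  G  B  G
      uint8_t row0[4] = {100, 200, 120, 210};
      uint8_t row1[4] = { 40,  90,  60,  95};
      // The second output pixel sits on an R site, so its B and G come from
      // neighbour averages (dst_argb[4..6] in the loop above); R is direct.
      uint8_t b = AVG(row1[0], row1[2]);  // (40 + 60) >> 1   = 50
      uint8_t g = AVG(row0[0], row0[2]);  // (100 + 120) >> 1 = 110
      uint8_t r = row0[1];                //                    200
      printf("B=%d G=%d R=%d A=255\n", b, g, r);
      return 0;
    }
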
diff --git a/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
index 15b0ed8..75f8a61 100644
--- a/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
@@ -13,11 +13,17 @@
 #ifdef HAVE_JPEG
 #include <assert.h>
 
-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
-    !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 // Must be included before jpeglib.
 #include <setjmp.h>
 #define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
 #endif
 struct FILE;  // For jpeglib.h.
 
@@ -101,7 +107,7 @@
   }
 
   buf_.data = src;
-  buf_.len = (int)(src_len);
+  buf_.len = static_cast<int>(src_len);
   buf_vec_.pos = 0;
   decompress_struct_->client_data = &buf_vec_;
 #ifdef HAVE_SETJMP
@@ -411,7 +417,7 @@
 }
 
 boolean fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
   if (buf_vec->pos >= buf_vec->len) {
     assert(0 && "No more data");
     // ERROR: No more data
@@ -447,7 +453,7 @@
   // ERROR: Error in jpeglib: buf
 #endif
 
-  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
   // This rewinds the call stack to the point of the corresponding setjmp()
   // and causes it to return (for a second time) with value 1.
   longjmp(mgr->setjmp_buffer, 1);
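
The cast cleanups above sit inside libjpeg's standard error-recovery pattern: the error callback is not allowed to return, so it longjmps back to a setjmp established around the decode call, which is what HAVE_SETJMP guards. A minimal, self-contained sketch of that pattern using hypothetical names, not the actual MJpegDecoder members:

    #include <csetjmp>
    #include <cstdio>

    // Hypothetical stand-in for the SetJmpErrorMgr used above.
    struct ErrorMgr {
      jmp_buf setjmp_buffer;
    };

    static ErrorMgr g_err;

    // Plays the role of the error_exit callback: it must not return,
    // so it rewinds the stack to the matching setjmp instead.
    void ErrorExit() {
      longjmp(g_err.setjmp_buffer, 1);
    }

    static bool DecodeOneFrame() {
      if (setjmp(g_err.setjmp_buffer)) {
        return false;  // reached only via longjmp: decoding failed
      }
      // ... the decoder would run here and call ErrorExit() on a fatal error.
      return true;
    }

    int main() {
      std::printf("decoded: %d\n", DecodeOneFrame() ? 1 : 0);
      return 0;
    }
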
diff --git a/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/libvpx/third_party/libyuv/source/mjpeg_validate.cc
index 23d22d0..8edfbe1 100644
--- a/libvpx/third_party/libyuv/source/mjpeg_validate.cc
+++ b/libvpx/third_party/libyuv/source/mjpeg_validate.cc
@@ -10,15 +10,66 @@
 
 #include "libyuv/mjpeg_decoder.h"
 
+#include <string.h>  // For memchr.
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // src
+    mov        eax, [esp + 8]   // val
+    mov        ecx, [esp + 12]  // count
+    repne scasb
+    jne        sr99
+    mov        eax, edi
+    sub        eax, 1
+    mov        edi, edx
+    ret
+
+  sr99:
+    mov        eax, 0
+    mov        edi, edx
+    ret
+  }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  const uint8* end = sample + sample_size - 1;
+  const uint8* it = sample;
+  for (;;) {
+#ifdef ENABLE_SCASB
+    it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+    if (it == NULL) {
+      break;
+    }
+    if (it[1] == 0xd9) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    ++it;  // Skip over current 0xff.
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
 // Helper function to validate the jpeg appears intact.
-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
 LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
-  size_t i;
+  const size_t kBackSearchSize = 1024;
   if (sample_size < 64) {
     // ERROR: Invalid jpeg size: sample_size
     return LIBYUV_FALSE;
@@ -27,17 +78,20 @@
     // ERROR: Invalid jpeg initial start code
     return LIBYUV_FALSE;
   }
-  for (i = sample_size - 2; i > 1;) {
-    if (sample[i] != 0xd9) {
-      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
-        return LIBYUV_TRUE;  // Success: Valid jpeg.
-      }
-      --i;
+  // Step over SOI marker.
+  sample += 2;
+  sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the last kilobyte of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
     }
-    --i;
+    // Reduce search size for forward search.
+    sample_size = sample_size - kBackSearchSize + 1;
   }
-  // ERROR: Invalid jpeg end code not found. Size sample_size
-  return LIBYUV_FALSE;
+  return ScanEOI(sample, sample_size);
+
 }
 
 #ifdef __cplusplus
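
The rewritten ValidateJpeg first looks for the EOI marker (0xff 0xd9) in the last kilobyte and only then falls back to a forward scan of the remaining bytes; the + 1 in the fallback size keeps a marker whose 0xff lands just before the back-search window visible to the second pass. A small check of exactly that boundary case, assuming ValidateJpeg is declared in libyuv/mjpeg_decoder.h as in this tree:

    #include <stdio.h>
    #include <string.h>
    #include "libyuv/mjpeg_decoder.h"  // for ValidateJpeg

    int main() {
      unsigned char buf[2048];
      memset(buf, 0, sizeof(buf));
      buf[0] = 0xff;  buf[1] = 0xd8;        // SOI
      // 0xff just before the 1 KB back-search window, 0xd9 just inside it.
      buf[sizeof(buf) - 1025] = 0xff;
      buf[sizeof(buf) - 1024] = 0xd9;
      printf("valid: %d\n", libyuv::ValidateJpeg(buf, sizeof(buf)));  // expect 1
      return 0;
    }
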
diff --git a/libvpx/third_party/libyuv/source/planar_functions.cc b/libvpx/third_party/libyuv/source/planar_functions.cc
index 3857008..b96bd50 100644
--- a/libvpx/third_party/libyuv/source/planar_functions.cc
+++ b/libvpx/third_party/libyuv/source/planar_functions.cc
@@ -41,16 +41,14 @@
   if (src_y == dst_y && src_stride_y == dst_stride_y) {
     return;
   }
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
   }
 #endif
-#if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -59,8 +57,8 @@
   }
 #endif
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
 #if defined(HAS_COPYROW_MIPS)
@@ -90,15 +88,8 @@
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
-#if defined(HAS_COPYROW_16_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_16_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_16_SSE2;
   }
 #endif
@@ -239,25 +230,43 @@
     src_stride_y = -src_stride_y;
   }
 #if defined(HAS_MIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    MirrorRow = MirrorRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
-    MirrorRow = MirrorRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRow = MirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    MirrorRow = MirrorRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    MirrorRow = MirrorRow_AVX2;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+// TODO(fbarchard): Make Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+    MirrorRow = MirrorRow_MIPS_DSPR2;
   }
 #endif
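
The pattern above repeats throughout this file: when the CPU flag is present, start with the _Any_ variant, which runs SIMD over the bulk of the row and finishes the ragged tail in C, and switch to the plain SIMD kernel only when the width is an exact multiple of the vector width. The pointer and stride alignment tests are gone because the rewritten kernels use unaligned loads and stores. A condensed sketch of the selection idiom with hypothetical kernel names:

    #include <stdint.h>

    typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

    // Trivial stand-ins for a C kernel, an any-width SIMD kernel and a
    // full-width SIMD kernel (hypothetical names, not libyuv's).
    static void RowC(const uint8_t* s, uint8_t* d, int w) {
      for (int i = 0; i < w; ++i) d[i] = s[i];
    }
    static void RowAnySimd(const uint8_t* s, uint8_t* d, int w) { RowC(s, d, w); }
    static void RowSimd(const uint8_t* s, uint8_t* d, int w) { RowC(s, d, w); }

    static RowFn PickRow(bool has_simd, int width) {
      RowFn fn = RowC;
      if (has_simd) {
        fn = RowAnySimd;           // any width, scalar tail
        if ((width & 15) == 0) {   // IS_ALIGNED(width, 16)
          fn = RowSimd;            // pure SIMD, no tail handling
        }
      }
      return fn;
    }

    int main() {
      uint8_t src[20] = {0}, dst[20];
      PickRow(true, 20)(src, dst, 20);  // width 20 is not a multiple of 16,
      return 0;                         // so the _Any_ variant is chosen.
    }
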
 
@@ -298,23 +307,17 @@
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
     YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          YUY2ToYRow = YUY2ToYRow_SSE2;
-        }
-      }
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -324,7 +327,7 @@
   }
 #endif
 #if defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
     if (width >= 16) {
       YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
@@ -376,23 +379,17 @@
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
     UYVYToYRow = UYVYToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUV422Row = UYVYToUV422Row_SSE2;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          UYVYToYRow = UYVYToYRow_SSE2;
-        }
-      }
+      UYVYToUV422Row = UYVYToUV422Row_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -402,7 +399,7 @@
   }
 #endif
 #if defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
     if (width >= 16) {
       UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
@@ -497,22 +494,28 @@
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    ARGBMirrorRow = ARGBMirrorRow_AVX2;
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
-    ARGBMirrorRow = ARGBMirrorRow_NEON;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
   }
 #endif
 
@@ -525,7 +528,7 @@
   return 0;
 }
 
-// Get a blender that optimized for the CPU, alignment and pixel count.
+// Get a blender that is optimized for the CPU and pixel count.
 // As there are 6 blenders to choose from, the caller should try to use
 // the same blend function for all pixels if possible.
 LIBYUV_API
@@ -614,7 +617,7 @@
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBMULTIPLYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
@@ -622,7 +625,7 @@
   }
 #endif
 #if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -630,7 +633,7 @@
   }
 #endif
 #if defined(HAS_ARGBMULTIPLYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBMultiplyRow = ARGBMultiplyRow_NEON;
@@ -674,13 +677,13 @@
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBAddRow = ARGBAddRow_SSE2;
   }
 #endif
-#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBAddRow = ARGBAddRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBAddRow = ARGBAddRow_SSE2;
@@ -688,7 +691,7 @@
   }
 #endif
 #if defined(HAS_ARGBADDROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBAddRow = ARGBAddRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAddRow = ARGBAddRow_AVX2;
@@ -696,7 +699,7 @@
   }
 #endif
 #if defined(HAS_ARGBADDROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBAddRow = ARGBAddRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBAddRow = ARGBAddRow_NEON;
@@ -741,7 +744,7 @@
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBSUBTRACTROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBSubtractRow = ARGBSubtractRow_SSE2;
@@ -749,7 +752,7 @@
   }
 #endif
 #if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -757,7 +760,7 @@
   }
 #endif
 #if defined(HAS_ARGBSUBTRACTROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBSubtractRow = ARGBSubtractRow_NEON;
@@ -808,24 +811,31 @@
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
   }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOBGRAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToBGRARow = I422ToBGRARow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       I422ToBGRARow = I422ToBGRARow_NEON;
     }
   }
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
-        I422ToBGRARow = I422ToBGRARow_SSSE3;
-      }
-    }
-  }
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -879,20 +889,26 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
   }
 #if defined(HAS_I422TOABGRROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToABGRRow = I422ToABGRRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       I422ToABGRRow = I422ToABGRRow_NEON;
     }
   }
-#elif defined(HAS_I422TOABGRROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
-        I422ToABGRRow = I422ToABGRRow_SSSE3;
-      }
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToABGRRow = I422ToABGRRow_AVX2;
     }
   }
 #endif
@@ -941,20 +957,26 @@
     src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
   }
 #if defined(HAS_I422TORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToRGBARow = I422ToRGBARow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       I422ToRGBARow = I422ToRGBARow_NEON;
     }
   }
-#elif defined(HAS_I422TORGBAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
-        I422ToRGBARow = I422ToRGBARow_SSSE3;
-      }
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
     }
   }
 #endif
@@ -991,14 +1013,23 @@
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_NV12TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
     }
   }
-#elif defined(HAS_NV12TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV12ToRGB565Row = NV12ToRGB565Row_NEON;
@@ -1039,14 +1070,23 @@
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_NV21TORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
     }
   }
-#elif defined(HAS_NV21TORGB565ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
     NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       NV21ToRGB565Row = NV21ToRGB565Row_NEON;
@@ -1070,8 +1110,12 @@
               int width, int height,
               uint32 value) {
   int y;
-  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
-  void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
+  void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
   // Coalesce rows.
   if (dst_stride_y == width) {
     width *= height;
@@ -1079,21 +1123,30 @@
     dst_stride_y = 0;
   }
 #if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    SetRow = SetRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SetRow = SetRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SetRow = SetRow_NEON;
+    }
   }
 #endif
 #if defined(HAS_SETROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    SetRow = SetRow_X86;
+  if (TestCpuFlag(kCpuHasX86)) {
+    SetRow = SetRow_Any_X86;
+    if (IS_ALIGNED(width, 4)) {
+      SetRow = SetRow_X86;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    SetRow = SetRow_ERMS;
   }
 #endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
-    SetRow(dst_y, v32, width);
+    SetRow(dst_y, value, width);
     dst_y += dst_stride_y;
   }
 }
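
SetRow now takes the fill byte directly instead of a 32-bit word pre-replicated by the caller, which lets SetRow_ERMS map onto rep stosb and makes the portable path a plain per-row byte fill. A sketch of what the loop above amounts to (hypothetical helper name):

    #include <stdint.h>
    #include <string.h>

    // Fill a width x height plane one row at a time, honouring the stride;
    // SetRow_C with a single byte value boils down to the memset here.
    void SetPlaneSketch(uint8_t* dst, int stride, int width, int height,
                        uint8_t value) {
      for (int y = 0; y < height; ++y) {
        memset(dst, value, width);
        dst += stride;
      }
    }

    int main() {
      uint8_t plane[8 * 4];
      SetPlaneSketch(plane, 8, 8, 4, 0x80);  // fill an 8x4 plane with 0x80
      return 0;
    }
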
@@ -1112,7 +1165,7 @@
   uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
   uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
   if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height <= 0 ||
+      width <= 0 || height == 0 ||
       x < 0 || y < 0 ||
       value_y < 0 || value_y > 255 ||
       value_u < 0 || value_u > 255 ||
@@ -1132,11 +1185,18 @@
              int dst_x, int dst_y,
              int width, int height,
              uint32 value) {
+  int y;
+  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
   if (!dst_argb ||
-      width <= 0 || height <= 0 ||
+      width <= 0 || height == 0 ||
       dst_x < 0 || dst_y < 0) {
     return -1;
   }
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   dst_argb += dst_y * dst_stride_argb + dst_x * 4;
   // Coalesce rows.
   if (dst_stride_argb == width * 4) {
@@ -1144,20 +1204,26 @@
     height = 1;
     dst_stride_argb = 0;
   }
-#if defined(HAS_SETROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
-    return 0;
+
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
   }
 #endif
-#if defined(HAS_SETROW_X86)
+#if defined(HAS_ARGBSETROW_X86)
   if (TestCpuFlag(kCpuHasX86)) {
-    ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
-    return 0;
+    ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
-  ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
   return 0;
 }
 
@@ -1197,9 +1263,7 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@@ -1207,7 +1271,7 @@
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
       ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1215,7 +1279,7 @@
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1223,7 +1287,7 @@
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBAttenuateRow = ARGBAttenuateRow_NEON;
@@ -1263,7 +1327,7 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
@@ -1271,7 +1335,7 @@
   }
 #endif
 #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1312,12 +1376,11 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_SSSE3;
   }
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
@@ -1350,11 +1413,11 @@
     dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBGRAYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_SSSE3;
   }
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
@@ -1383,11 +1446,11 @@
     dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBSEPIAROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBSepiaRow = ARGBSepiaRow_SSSE3;
   }
-#elif defined(HAS_ARGBSEPIAROW_NEON)
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBSepiaRow = ARGBSepiaRow_NEON;
   }
@@ -1425,11 +1488,11 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
     ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
   }
-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
@@ -1568,11 +1631,11 @@
     dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBQUANTIZEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
     ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
   }
-#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBQuantizeRow = ARGBQuantizeRow_NEON;
   }
@@ -1743,12 +1806,11 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBSHADEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
     ARGBShadeRow = ARGBShadeRow_SSE2;
   }
-#elif defined(HAS_ARGBSHADEROW_NEON)
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBShadeRow = ARGBShadeRow_NEON;
   }
@@ -1790,33 +1852,23 @@
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
   }
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -1824,19 +1876,19 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(width, 4)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
       IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
   }
 #endif
 
@@ -1876,7 +1928,7 @@
     src_stride_bgra = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBSHUFFLEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBShuffleRow = ARGBShuffleRow_SSE2;
@@ -1884,19 +1936,15 @@
   }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        ARGBShuffleRow = ARGBShuffleRow_SSSE3;
-      }
+      ARGBShuffleRow = ARGBShuffleRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1904,7 +1952,7 @@
   }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
     if (IS_ALIGNED(width, 4)) {
       ARGBShuffleRow = ARGBShuffleRow_NEON;
@@ -1928,8 +1976,8 @@
                                          const uint8* src_sobely,
                                          uint8* dst, int width)) {
   int y;
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) = ARGBToBayerGGRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
+      ARGBToYJRow_C;
   void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) = SobelYRow_C;
   void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
@@ -1945,33 +1993,32 @@
     src_argb  = src_argb  + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-  // ARGBToBayer used to select G channel from ARGB.
-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerGGRow_NEON;
+      ARGBToYJRow = ARGBToYJRow_NEON;
     }
   }
 #endif
+
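
ARGBSobelize previously fed the Sobel filters the G channel only, extracted by ARGBToBayerGGRow with selector 0x0d090501 (byte indices 1, 5, 9 and 13, the G lane of four little-endian BGRA pixels); it now feeds them full-range luma from ARGBToYJRow. A scalar sketch of the two source signals, using BT.601-style weights as a stand-in for the exact fixed-point coefficients in ARGBToYJRow_C:

    #include <stdint.h>

    // Old source: the G byte of each BGRA pixel, i.e. what the selector picked.
    void ExtractG(const uint8_t* argb, uint8_t* dst_g, int width) {
      for (int i = 0; i < width; ++i) {
        dst_g[i] = argb[4 * i + 1];
      }
    }

    // New source: per-pixel full-range luma (weights approximate BT.601
    // 0.114 B + 0.587 G + 0.299 R; the row kernels use their own rounding).
    void ExtractLuma(const uint8_t* argb, uint8_t* dst_y, int width) {
      for (int i = 0; i < width; ++i) {
        int b = argb[4 * i + 0], g = argb[4 * i + 1], r = argb[4 * i + 2];
        dst_y[i] = (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
      }
    }
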
 #if defined(HAS_SOBELYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelYRow = SobelYRow_SSE2;
@@ -1994,7 +2041,7 @@
 #endif
   {
     // 3 rows with edges before/after.
-    const int kRowSize = (width + kEdge + 15) & ~15;
+    const int kRowSize = (width + kEdge + 31) & ~31;
     align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
     uint8* row_sobelx = rows;
     uint8* row_sobely = rows + kRowSize;
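
kRowSize now pads each scratch row to a multiple of 32 bytes, presumably so the wider AVX2 ARGBToYJ kernels selected above can store a full block without spilling into the next row. The (n + a - 1) & ~(a - 1) idiom rounds n up to the next multiple of a power-of-two a; a quick check:

    #include <stdio.h>

    // Round n up to the next multiple of a power-of-two alignment.
    static int RoundUp(int n, int align) {
      return (n + align - 1) & ~(align - 1);
    }

    int main() {
      const int kEdge = 16;  // illustrative; kEdge is defined earlier in ARGBSobelize
      printf("%d\n", RoundUp(100 + kEdge, 32));  // 116 rounds up to 128
      return 0;
    }
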
@@ -2004,20 +2051,20 @@
     uint8* row_y0 = row_y + kEdge;
     uint8* row_y1 = row_y0 + kRowSize;
     uint8* row_y2 = row_y1 + kRowSize;
-    ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+    ARGBToYJRow(src_argb, row_y0, width);
     row_y0[-1] = row_y0[0];
     memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+    ARGBToYJRow(src_argb, row_y1, width);
     row_y1[-1] = row_y1[0];
     memset(row_y1 + width, row_y1[width - 1], 16);
     memset(row_y2 + width, 0, 16);
 
     for (y = 0; y < height; ++y) {
-      // Convert next row of ARGB to Y.
+      // Convert next row of ARGB to G.
       if (y < (height - 1)) {
         src_argb += src_stride_argb;
       }
-      ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+      ARGBToYJRow(src_argb, row_y2, width);
       row_y2[-1] = row_y2[0];
       row_y2[width] = row_y2[width - 1];
 
@@ -2048,14 +2095,19 @@
   void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    SobelRow = SobelRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelRow = SobelRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SOBELROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    SobelRow = SobelRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelRow = SobelRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      SobelRow = SobelRow_NEON;
+    }
   }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2070,14 +2122,19 @@
   void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_, int width) = SobelToPlaneRow_C;
 #if defined(HAS_SOBELTOPLANEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    SobelToPlaneRow = SobelToPlaneRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelToPlaneRow = SobelToPlaneRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SOBELTOPLANEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    SobelToPlaneRow = SobelToPlaneRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SobelToPlaneRow = SobelToPlaneRow_NEON;
+    }
   }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
@@ -2093,14 +2150,19 @@
   void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    SobelXYRow = SobelXYRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXYRow = SobelXYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SOBELXYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    SobelXYRow = SobelXYRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXYRow = SobelXYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      SobelXYRow = SobelXYRow_NEON;
+    }
   }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2218,10 +2280,7 @@
     src_stride_argb = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBCOPYALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
-      IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
   }
 #endif
@@ -2264,10 +2323,7 @@
     src_stride_y = dst_stride_argb = 0;
   }
 #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
-      IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
     ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
   }
 #endif
@@ -2285,6 +2341,214 @@
   return 0;
 }
 
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_yuy2 ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // 2 rows of uv
+    align_buffer_64(rows, awidth * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_yuy2, dst_y, rows, awidth);
+      SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
+                 rows + awidth, awidth);
+      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      src_yuy2 += src_stride_yuy2 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split Y from UV.
+      SplitUVRow(src_yuy2, dst_y, dst_uv, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
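
YUY2 packs luma in the even bytes and interleaved Cb/Cr in the odd bytes, so SplitUVRow over awidth byte pairs peels a full Y row into dst_y and a packed UV row into the scratch buffer; InterpolateRow with source_y_fraction 128 then averages the two scratch rows into the single NV12 chroma row for the pair. A scalar sketch of one 2x2 block of that math:

    #include <stdint.h>
    #include <stdio.h>

    int main() {
      // Two YUY2 rows of two pixels each: bytes are Y0 U Y1 V.
      uint8_t row0[4] = {16, 120, 17, 130};
      uint8_t row1[4] = {18, 124, 19, 134};
      uint8_t y[4] = {row0[0], row0[2], row1[0], row1[2]};  // four Y samples
      // NV12 keeps one U/V pair per 2x2 block: the rounding average of the
      // two rows, which is what InterpolateRow with fraction 128 effectively computes.
      uint8_t u = (uint8_t)((row0[1] + row1[1] + 1) >> 1);  // 122
      uint8_t v = (uint8_t)((row0[3] + row1[3] + 1) >> 1);  // 132
      printf("Y=%d,%d,%d,%d U=%d V=%d\n", y[0], y[1], y[2], y[3], u, v);
      return 0;
    }
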
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // 2 rows of uv
+    align_buffer_64(rows, awidth * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, rows, dst_y, awidth);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
+                 dst_y + dst_stride_y, awidth);
+      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, dst_y, dst_uv, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
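
UYVYToNV12 is the same pipeline with the two SplitUVRow outputs swapped, because UYVY carries chroma in the even bytes and luma in the odd bytes. A small reminder of the two packings and why the call sites differ:

    #include <stdint.h>
    #include <stdio.h>

    int main() {
      // Two pixels in each packing (sample values only).
      uint8_t yuy2[4] = {16, 120, 17, 130};  // Y0 U  Y1 V
      uint8_t uyvy[4] = {120, 16, 130, 17};  // U  Y0 V  Y1
      // SplitUVRow sends even bytes to its first output and odd bytes to its
      // second, so YUY2ToNV12 passes (dst_y, rows) and UYVYToNV12 (rows, dst_y).
      printf("YUY2 even bytes: %d %d (luma)\n", yuy2[0], yuy2[2]);
      printf("UYVY even bytes: %d %d (chroma)\n", uyvy[0], uyvy[2]);
      return 0;
    }
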
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/libvpx/third_party/libyuv/source/rotate.cc b/libvpx/third_party/libyuv/source/rotate.cc
index 2ef3228..be3d589 100644
--- a/libvpx/third_party/libyuv/source/rotate.cc
+++ b/libvpx/third_party/libyuv/source/rotate.cc
@@ -13,6 +13,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
 #include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
 #include "libyuv/row.h"
 
 #ifdef __cplusplus
@@ -20,815 +21,39 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#if defined(__APPLE__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".private_extern _" #name "                \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-"_" #name ":                                   \n"
-#else
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".align 4,0x90                             \n"                             \
-#name ":                                       \n"
-#endif
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
-#define HAS_TRANSPOSE_WX8_NEON
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWX8_NEON
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int width);
-#endif  // defined(__ARM_NEON__)
-
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-
-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
-                                  uint8* dst, int dst_stride, int width);
-#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b,
-                               int width);
-#endif  // defined(__mips__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
-    defined(_M_IX86) && defined(_MSC_VER)
-#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked) __declspec(align(16))
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                               uint8* dst, int dst_stride, int width) {
-  __asm {
-    push      edi
-    push      esi
-    push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
-    mov       edx, [esp + 12 + 12]  // dst
-    mov       esi, [esp + 12 + 16]  // dst_stride
-    mov       ecx, [esp + 12 + 20]  // width
-
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    align      4
- convertloop:
-    movq      xmm0, qword ptr [eax]
-    lea       ebp, [eax + 8]
-    movq      xmm1, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm0, xmm1
-    movq      xmm2, qword ptr [eax]
-    movdqa    xmm1, xmm0
-    palignr   xmm1, xmm1, 8
-    movq      xmm3, qword ptr [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    punpcklbw xmm2, xmm3
-    movdqa    xmm3, xmm2
-    movq      xmm4, qword ptr [eax]
-    palignr   xmm3, xmm3, 8
-    movq      xmm5, qword ptr [eax + edi]
-    punpcklbw xmm4, xmm5
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm5, xmm4
-    movq      xmm6, qword ptr [eax]
-    palignr   xmm5, xmm5, 8
-    movq      xmm7, qword ptr [eax + edi]
-    punpcklbw xmm6, xmm7
-    mov       eax, ebp
-    movdqa    xmm7, xmm6
-    palignr   xmm7, xmm7, 8
-    // Second round of bit swap.
-    punpcklwd xmm0, xmm2
-    punpcklwd xmm1, xmm3
-    movdqa    xmm2, xmm0
-    movdqa    xmm3, xmm1
-    palignr   xmm2, xmm2, 8
-    palignr   xmm3, xmm3, 8
-    punpcklwd xmm4, xmm6
-    punpcklwd xmm5, xmm7
-    movdqa    xmm6, xmm4
-    movdqa    xmm7, xmm5
-    palignr   xmm6, xmm6, 8
-    palignr   xmm7, xmm7, 8
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    punpckldq xmm0, xmm4
-    movq      qword ptr [edx], xmm0
-    movdqa    xmm4, xmm0
-    palignr   xmm4, xmm4, 8
-    movq      qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    punpckldq xmm2, xmm6
-    movdqa    xmm6, xmm2
-    palignr   xmm6, xmm6, 8
-    movq      qword ptr [edx], xmm2
-    punpckldq xmm1, xmm5
-    movq      qword ptr [edx + esi], xmm6
-    lea       edx, [edx + 2 * esi]
-    movdqa    xmm5, xmm1
-    movq      qword ptr [edx], xmm1
-    palignr   xmm5, xmm5, 8
-    punpckldq xmm3, xmm7
-    movq      qword ptr [edx + esi], xmm5
-    lea       edx, [edx + 2 * esi]
-    movq      qword ptr [edx], xmm3
-    movdqa    xmm7, xmm3
-    palignr   xmm7, xmm7, 8
-    sub       ecx, 8
-    movq      qword ptr [edx + esi], xmm7
-    lea       edx, [edx + 2 * esi]
-    jg        convertloop
-
-    pop       ebp
-    pop       esi
-    pop       edi
-    ret
-  }
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked) __declspec(align(16))
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                uint8* dst_a, int dst_stride_a,
-                                uint8* dst_b, int dst_stride_b,
-                                int w) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
-    mov       edx, [esp + 16 + 12]  // dst_a
-    mov       esi, [esp + 16 + 16]  // dst_stride_a
-    mov       ebx, [esp + 16 + 20]  // dst_b
-    mov       ebp, [esp + 16 + 24]  // dst_stride_b
-    mov       ecx, esp
-    sub       esp, 4 + 16
-    and       esp, ~15
-    mov       [esp + 16], ecx
-    mov       ecx, [ecx + 16 + 28]  // w
-
-    align      4
- convertloop:
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    movdqa    xmm0, [eax]
-    movdqa    xmm1, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm0  // use xmm7 as temp register.
-    punpcklbw xmm0, xmm1
-    punpckhbw xmm7, xmm1
-    movdqa    xmm1, xmm7
-    movdqa    xmm2, [eax]
-    movdqa    xmm3, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm2
-    punpcklbw xmm2, xmm3
-    punpckhbw xmm7, xmm3
-    movdqa    xmm3, xmm7
-    movdqa    xmm4, [eax]
-    movdqa    xmm5, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    xmm7, xmm4
-    punpcklbw xmm4, xmm5
-    punpckhbw xmm7, xmm5
-    movdqa    xmm5, xmm7
-    movdqa    xmm6, [eax]
-    movdqa    xmm7, [eax + edi]
-    lea       eax, [eax + 2 * edi]
-    movdqa    [esp], xmm5  // backup xmm5
-    neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
-    punpcklbw xmm6, xmm7
-    punpckhbw xmm5, xmm7
-    movdqa    xmm7, xmm5
-    lea       eax, [eax + 8 * edi + 16]
-    neg       edi
-    // Second round of bit swap.
-    movdqa    xmm5, xmm0
-    punpcklwd xmm0, xmm2
-    punpckhwd xmm5, xmm2
-    movdqa    xmm2, xmm5
-    movdqa    xmm5, xmm1
-    punpcklwd xmm1, xmm3
-    punpckhwd xmm5, xmm3
-    movdqa    xmm3, xmm5
-    movdqa    xmm5, xmm4
-    punpcklwd xmm4, xmm6
-    punpckhwd xmm5, xmm6
-    movdqa    xmm6, xmm5
-    movdqa    xmm5, [esp]  // restore xmm5
-    movdqa    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
-    punpcklwd xmm5, xmm7
-    punpckhwd xmm6, xmm7
-    movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    movdqa    xmm6, xmm0
-    punpckldq xmm0, xmm4
-    punpckhdq xmm6, xmm4
-    movdqa    xmm4, xmm6
-    movdqa    xmm6, [esp]  // restore xmm6
-    movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [ebx], xmm0
-    movlpd    qword ptr [edx + esi], xmm4
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm4
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
-    punpckldq xmm2, xmm6
-    movlpd    qword ptr [edx], xmm2
-    movhpd    qword ptr [ebx], xmm2
-    punpckhdq xmm0, xmm6
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
-    punpckldq xmm1, xmm5
-    movlpd    qword ptr [edx], xmm1
-    movhpd    qword ptr [ebx], xmm1
-    punpckhdq xmm0, xmm5
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
-    punpckldq xmm3, xmm7
-    movlpd    qword ptr [edx], xmm3
-    movhpd    qword ptr [ebx], xmm3
-    punpckhdq xmm0, xmm7
-    sub       ecx, 8
-    movlpd    qword ptr [edx + esi], xmm0
-    lea       edx, [edx + 2 * esi]
-    movhpd    qword ptr [ebx + ebp], xmm0
-    lea       ebx, [ebx + 2 * ebp]
-    jg        convertloop
-
-    mov       esp, [esp + 16]
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-#elif !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
-#define HAS_TRANSPOSE_WX8_SSSE3
-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                               uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    ".p2align  2                                 \n"
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc"
-  #if defined(__SSE2__)
-      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  #endif
-  );
-}
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
-#define HAS_TRANSPOSE_UVWX8_SSE2
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w);
-  asm (
-    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
-    "push   %ebx                               \n"
-    "push   %esi                               \n"
-    "push   %edi                               \n"
-    "push   %ebp                               \n"
-    "mov    0x14(%esp),%eax                    \n"
-    "mov    0x18(%esp),%edi                    \n"
-    "mov    0x1c(%esp),%edx                    \n"
-    "mov    0x20(%esp),%esi                    \n"
-    "mov    0x24(%esp),%ebx                    \n"
-    "mov    0x28(%esp),%ebp                    \n"
-    "mov    %esp,%ecx                          \n"
-    "sub    $0x14,%esp                         \n"
-    "and    $0xfffffff0,%esp                   \n"
-    "mov    %ecx,0x10(%esp)                    \n"
-    "mov    0x2c(%ecx),%ecx                    \n"
-
-"1:                                            \n"
-    "movdqa (%eax),%xmm0                       \n"
-    "movdqa (%eax,%edi,1),%xmm1                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm0,%xmm7                        \n"
-    "punpcklbw %xmm1,%xmm0                     \n"
-    "punpckhbw %xmm1,%xmm7                     \n"
-    "movdqa %xmm7,%xmm1                        \n"
-    "movdqa (%eax),%xmm2                       \n"
-    "movdqa (%eax,%edi,1),%xmm3                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm2,%xmm7                        \n"
-    "punpcklbw %xmm3,%xmm2                     \n"
-    "punpckhbw %xmm3,%xmm7                     \n"
-    "movdqa %xmm7,%xmm3                        \n"
-    "movdqa (%eax),%xmm4                       \n"
-    "movdqa (%eax,%edi,1),%xmm5                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm4,%xmm7                        \n"
-    "punpcklbw %xmm5,%xmm4                     \n"
-    "punpckhbw %xmm5,%xmm7                     \n"
-    "movdqa %xmm7,%xmm5                        \n"
-    "movdqa (%eax),%xmm6                       \n"
-    "movdqa (%eax,%edi,1),%xmm7                \n"
-    "lea    (%eax,%edi,2),%eax                 \n"
-    "movdqa %xmm5,(%esp)                       \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm6,%xmm5                        \n"
-    "punpcklbw %xmm7,%xmm6                     \n"
-    "punpckhbw %xmm7,%xmm5                     \n"
-    "movdqa %xmm5,%xmm7                        \n"
-    "lea    0x10(%eax,%edi,8),%eax             \n"
-    "neg    %edi                               \n"
-    "movdqa %xmm0,%xmm5                        \n"
-    "punpcklwd %xmm2,%xmm0                     \n"
-    "punpckhwd %xmm2,%xmm5                     \n"
-    "movdqa %xmm5,%xmm2                        \n"
-    "movdqa %xmm1,%xmm5                        \n"
-    "punpcklwd %xmm3,%xmm1                     \n"
-    "punpckhwd %xmm3,%xmm5                     \n"
-    "movdqa %xmm5,%xmm3                        \n"
-    "movdqa %xmm4,%xmm5                        \n"
-    "punpcklwd %xmm6,%xmm4                     \n"
-    "punpckhwd %xmm6,%xmm5                     \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "movdqa (%esp),%xmm5                       \n"
-    "movdqa %xmm6,(%esp)                       \n"
-    "movdqa %xmm5,%xmm6                        \n"
-    "punpcklwd %xmm7,%xmm5                     \n"
-    "punpckhwd %xmm7,%xmm6                     \n"
-    "movdqa %xmm6,%xmm7                        \n"
-    "movdqa %xmm0,%xmm6                        \n"
-    "punpckldq %xmm4,%xmm0                     \n"
-    "punpckhdq %xmm4,%xmm6                     \n"
-    "movdqa %xmm6,%xmm4                        \n"
-    "movdqa (%esp),%xmm6                       \n"
-    "movlpd %xmm0,(%edx)                       \n"
-    "movhpd %xmm0,(%ebx)                       \n"
-    "movlpd %xmm4,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm2,%xmm0                        \n"
-    "punpckldq %xmm6,%xmm2                     \n"
-    "movlpd %xmm2,(%edx)                       \n"
-    "movhpd %xmm2,(%ebx)                       \n"
-    "punpckhdq %xmm6,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm1,%xmm0                        \n"
-    "punpckldq %xmm5,%xmm1                     \n"
-    "movlpd %xmm1,(%edx)                       \n"
-    "movhpd %xmm1,(%ebx)                       \n"
-    "punpckhdq %xmm5,%xmm0                     \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "movdqa %xmm3,%xmm0                        \n"
-    "punpckldq %xmm7,%xmm3                     \n"
-    "movlpd %xmm3,(%edx)                       \n"
-    "movhpd %xmm3,(%ebx)                       \n"
-    "punpckhdq %xmm7,%xmm0                     \n"
-    "sub    $0x8,%ecx                          \n"
-    "movlpd %xmm0,(%edx,%esi,1)                \n"
-    "lea    (%edx,%esi,2),%edx                 \n"
-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
-    "lea    (%ebx,%ebp,2),%ebx                 \n"
-    "jg     1b                                 \n"
-    "mov    0x10(%esp),%esp                    \n"
-    "pop    %ebp                               \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "pop    %ebx                               \n"
-#if defined(__native_client__)
-    "pop    %ecx                               \n"
-    "and    $0xffffffe0,%ecx                   \n"
-    "jmp    *%ecx                              \n"
-#else
-    "ret                                       \n"
-#endif
-);
-#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
-#define HAS_TRANSPOSE_WX8_FAST_SSSE3
-static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
-                                    uint8* dst, int dst_stride, int width) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  2                                 \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%3),%%xmm1                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqa     (%0),%%xmm2                      \n"
-  "movdqa     %%xmm0,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm9                    \n"
-  "palignr    $0x8,%%xmm1,%%xmm1               \n"
-  "palignr    $0x8,%%xmm9,%%xmm9               \n"
-  "movdqa     (%0,%3),%%xmm3                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm10                   \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm10                   \n"
-  "movdqa     %%xmm2,%%xmm3                    \n"
-  "movdqa     %%xmm10,%%xmm11                  \n"
-  "movdqa     (%0),%%xmm4                      \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "movdqa     (%0,%3),%%xmm5                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm12                   \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm12                   \n"
-  "movdqa     %%xmm4,%%xmm5                    \n"
-  "movdqa     %%xmm12,%%xmm13                  \n"
-  "movdqa     (%0),%%xmm6                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movdqa     (%0,%3),%%xmm7                   \n"
-  "lea        (%0,%3,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm14                   \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "punpckhbw  %%xmm7,%%xmm14                   \n"
-  "neg        %3                               \n"
-  "movdqa     %%xmm6,%%xmm7                    \n"
-  "movdqa     %%xmm14,%%xmm15                  \n"
-  "lea        0x10(%0,%3,8),%0                 \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "neg        %3                               \n"
-   // Second round of bit swap.
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "palignr    $0x8,%%xmm2,%%xmm2               \n"
-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm4,%%xmm6                    \n"
-  "movdqa     %%xmm5,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "punpcklwd  %%xmm10,%%xmm8                   \n"
-  "punpcklwd  %%xmm11,%%xmm9                   \n"
-  "movdqa     %%xmm8,%%xmm10                   \n"
-  "movdqa     %%xmm9,%%xmm11                   \n"
-  "palignr    $0x8,%%xmm10,%%xmm10             \n"
-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
-  "punpcklwd  %%xmm14,%%xmm12                  \n"
-  "punpcklwd  %%xmm15,%%xmm13                  \n"
-  "movdqa     %%xmm12,%%xmm14                  \n"
-  "movdqa     %%xmm13,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movdqa     %%xmm0,%%xmm4                    \n"
-  "palignr    $0x8,%%xmm4,%%xmm4               \n"
-  "movq       %%xmm4,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movdqa     %%xmm2,%%xmm6                    \n"
-  "movq       %%xmm2,(%1)                      \n"
-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movq       %%xmm6,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm1,%%xmm5                    \n"
-  "movq       %%xmm1,(%1)                      \n"
-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
-  "movq       %%xmm5,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movq       %%xmm3,(%1)                      \n"
-  "movdqa     %%xmm3,%%xmm7                    \n"
-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
-  "movq       %%xmm7,(%1,%4)                   \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm12,%%xmm8                   \n"
-  "movq       %%xmm8,(%1)                      \n"
-  "movdqa     %%xmm8,%%xmm12                   \n"
-  "palignr    $0x8,%%xmm12,%%xmm12             \n"
-  "movq       %%xmm12,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm14,%%xmm10                  \n"
-  "movdqa     %%xmm10,%%xmm14                  \n"
-  "movq       %%xmm10,(%1)                     \n"
-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
-  "punpckldq  %%xmm13,%%xmm9                   \n"
-  "movq       %%xmm14,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "movdqa     %%xmm9,%%xmm13                   \n"
-  "movq       %%xmm9,(%1)                      \n"
-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
-  "movq       %%xmm13,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "punpckldq  %%xmm15,%%xmm11                  \n"
-  "movq       %%xmm11,(%1)                     \n"
-  "movdqa     %%xmm11,%%xmm15                  \n"
-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
-  "sub        $0x10,%2                         \n"
-  "movq       %%xmm15,(%1,%4)                  \n"
-  "lea        (%1,%4,2),%1                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"((intptr_t)(dst_stride))   // %4
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-);
-}
-
-#define HAS_TRANSPOSE_UVWX8_SSE2
-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                                uint8* dst_a, int dst_stride_a,
-                                uint8* dst_b, int dst_stride_b,
-                                int w) {
-  asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-  ".p2align  2                                 \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%4),%%xmm1                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpcklbw  %%xmm1,%%xmm0                    \n"
-  "punpckhbw  %%xmm1,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm1                    \n"
-  "movdqa     (%0),%%xmm2                      \n"
-  "movdqa     (%0,%4),%%xmm3                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpcklbw  %%xmm3,%%xmm2                    \n"
-  "punpckhbw  %%xmm3,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm3                    \n"
-  "movdqa     (%0),%%xmm4                      \n"
-  "movdqa     (%0,%4),%%xmm5                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "punpcklbw  %%xmm5,%%xmm4                    \n"
-  "punpckhbw  %%xmm5,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm5                    \n"
-  "movdqa     (%0),%%xmm6                      \n"
-  "movdqa     (%0,%4),%%xmm7                   \n"
-  "lea        (%0,%4,2),%0                     \n"
-  "movdqa     %%xmm6,%%xmm8                    \n"
-  "punpcklbw  %%xmm7,%%xmm6                    \n"
-  "neg        %4                               \n"
-  "lea        0x10(%0,%4,8),%0                 \n"
-  "punpckhbw  %%xmm7,%%xmm8                    \n"
-  "movdqa     %%xmm8,%%xmm7                    \n"
-  "neg        %4                               \n"
-   // Second round of bit swap.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "movdqa     %%xmm1,%%xmm9                    \n"
-  "punpckhwd  %%xmm2,%%xmm8                    \n"
-  "punpckhwd  %%xmm3,%%xmm9                    \n"
-  "punpcklwd  %%xmm2,%%xmm0                    \n"
-  "punpcklwd  %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm8,%%xmm2                    \n"
-  "movdqa     %%xmm9,%%xmm3                    \n"
-  "movdqa     %%xmm4,%%xmm8                    \n"
-  "movdqa     %%xmm5,%%xmm9                    \n"
-  "punpckhwd  %%xmm6,%%xmm8                    \n"
-  "punpckhwd  %%xmm7,%%xmm9                    \n"
-  "punpcklwd  %%xmm6,%%xmm4                    \n"
-  "punpcklwd  %%xmm7,%%xmm5                    \n"
-  "movdqa     %%xmm8,%%xmm6                    \n"
-  "movdqa     %%xmm9,%%xmm7                    \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "movdqa     %%xmm0,%%xmm8                    \n"
-  "punpckldq  %%xmm4,%%xmm0                    \n"
-  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-  "punpckhdq  %%xmm4,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm2,%%xmm8                    \n"
-  "punpckldq  %%xmm6,%%xmm2                    \n"
-  "movlpd     %%xmm2,(%1)                      \n"
-  "movhpd     %%xmm2,(%2)                      \n"
-  "punpckhdq  %%xmm6,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm1,%%xmm8                    \n"
-  "punpckldq  %%xmm5,%%xmm1                    \n"
-  "movlpd     %%xmm1,(%1)                      \n"
-  "movhpd     %%xmm1,(%2)                      \n"
-  "punpckhdq  %%xmm5,%%xmm8                    \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "movdqa     %%xmm3,%%xmm8                    \n"
-  "punpckldq  %%xmm7,%%xmm3                    \n"
-  "movlpd     %%xmm3,(%1)                      \n"
-  "movhpd     %%xmm3,(%2)                      \n"
-  "punpckhdq  %%xmm7,%%xmm8                    \n"
-  "sub        $0x8,%3                          \n"
-  "movlpd     %%xmm8,(%1,%5)                   \n"
-  "lea        (%1,%5,2),%1                     \n"
-  "movhpd     %%xmm8,(%2,%6)                   \n"
-  "lea        (%2,%6,2),%2                     \n"
-  "jg         1b                               \n"
-  : "+r"(src),    // %0
-    "+r"(dst_a),  // %1
-    "+r"(dst_b),  // %2
-    "+r"(w)   // %3
-  : "r"((intptr_t)(src_stride)),    // %4
-    "r"((intptr_t)(dst_stride_a)),  // %5
-    "r"((intptr_t)(dst_stride_b))   // %6
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-    "xmm8", "xmm9"
-);
-}
-#endif
-#endif
-
-static void TransposeWx8_C(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride,
-                           int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst[0] = src[0 * src_stride];
-    dst[1] = src[1 * src_stride];
-    dst[2] = src[2 * src_stride];
-    dst[3] = src[3 * src_stride];
-    dst[4] = src[4 * src_stride];
-    dst[5] = src[5 * src_stride];
-    dst[6] = src[6 * src_stride];
-    dst[7] = src[7 * src_stride];
-    ++src;
-    dst += dst_stride;
-  }
-}
-
-static void TransposeWxH_C(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride,
-                           int width, int height) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    int j;
-    for (j = 0; j < height; ++j) {
-      dst[i * dst_stride + j] = src[j * src_stride + i];
-    }
-  }
-}
-
 LIBYUV_API
 void TransposePlane(const uint8* src, int src_stride,
                     uint8* dst, int dst_stride,
                     int width, int height) {
   int i = height;
   void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
-                       int width) = TransposeWx8_C;
-#if defined(HAS_TRANSPOSE_WX8_NEON)
+                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
   }
 #endif
-#if defined(HAS_TRANSPOSE_WX8_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
-    TransposeWx8 = TransposeWx8_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeWx8 = TransposeWx8_SSSE3;
+    }
   }
 #endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
-    TransposeWx8 = TransposeWx8_FAST_SSSE3;
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx8 = TransposeWx8_Fast_SSSE3;
+    }
   }
 #endif
-#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
+#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
     if (IS_ALIGNED(width, 4) &&
         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
+      TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
     } else {
       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
     }
@@ -843,7 +68,9 @@
     i -= 8;
   }
 
-  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  if (i > 0) {
+    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  }
 }
 
 LIBYUV_API
@@ -883,29 +110,38 @@
   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_MIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    MirrorRow = MirrorRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    MirrorRow = MirrorRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRow = MirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    MirrorRow = MirrorRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    MirrorRow = MirrorRow_AVX2;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
   }
 #endif
+// TODO(fbarchard): Make Mirror on MIPS handle unaligned memory.
 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -913,21 +149,14 @@
     MirrorRow = MirrorRow_MIPS_DSPR2;
   }
 #endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
-    CopyRow = CopyRow_X86;
-  }
-#endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -935,6 +164,11 @@
     CopyRow = CopyRow_ERMS;
   }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
@@ -954,48 +188,6 @@
   free_aligned_buffer_64(row);
 }
 
-static void TransposeUVWx8_C(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b,
-                             int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    dst_a[0] = src[0 * src_stride + 0];
-    dst_b[0] = src[0 * src_stride + 1];
-    dst_a[1] = src[1 * src_stride + 0];
-    dst_b[1] = src[1 * src_stride + 1];
-    dst_a[2] = src[2 * src_stride + 0];
-    dst_b[2] = src[2 * src_stride + 1];
-    dst_a[3] = src[3 * src_stride + 0];
-    dst_b[3] = src[3 * src_stride + 1];
-    dst_a[4] = src[4 * src_stride + 0];
-    dst_b[4] = src[4 * src_stride + 1];
-    dst_a[5] = src[5 * src_stride + 0];
-    dst_b[5] = src[5 * src_stride + 1];
-    dst_a[6] = src[6 * src_stride + 0];
-    dst_b[6] = src[6 * src_stride + 1];
-    dst_a[7] = src[7 * src_stride + 0];
-    dst_b[7] = src[7 * src_stride + 1];
-    src += 2;
-    dst_a += dst_stride_a;
-    dst_b += dst_stride_b;
-  }
-}
-
-static void TransposeUVWxH_C(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b,
-                             int width, int height) {
-  int i;
-  for (i = 0; i < width * 2; i += 2) {
-    int j;
-    for (j = 0; j < height; ++j) {
-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
-    }
-  }
-}
-
 LIBYUV_API
 void TransposeUV(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
@@ -1006,17 +198,17 @@
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
   }
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
     TransposeUVWx8 = TransposeUVWx8_SSE2;
   }
-#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+#endif
+#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@@ -1035,10 +227,12 @@
     i -= 8;
   }
 
-  TransposeUVWxH_C(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
-                   width, i);
+  if (i > 0) {
+    TransposeUVWxH_C(src, src_stride,
+                     dst_a, dst_stride_a,
+                     dst_b, dst_stride_b,
+                     width, i);
+  }
 }
 
 LIBYUV_API
@@ -1084,12 +278,13 @@
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     MirrorRowUV = MirrorUVRow_NEON;
   }
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_MIRRORROW_UV_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
     MirrorRowUV = MirrorUVRow_SSSE3;
   }
-#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
diff --git a/libvpx/third_party/libyuv/source/rotate_any.cc b/libvpx/third_party/libyuv/source/rotate_any.cc
new file mode 100644
index 0000000..4d6eb34
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/rotate_any.cc
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                                 \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                 uint8* dst, int dst_stride, int width) {                      \
+      int r = width & MASK;                                                    \
+      int n = width - r;                                                       \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
+      }                                                                        \
+      TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);        \
+    }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
+TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
+#endif
+
+#undef TANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
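Note on rotate_any.cc above: the TANY macro generates the _Any_ transpose
wrappers that TransposePlane now installs. Each wrapper runs the SIMD kernel
on the widest prefix whose width is a multiple of the kernel's block size
(MASK + 1) and finishes the leftover columns with the C kernel, which is why
the callers can drop the old pointer and stride alignment checks. Expanding
one instantiation by hand (a sketch of what the preprocessor produces, using
the kernel signatures from this diff):

  void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
                              uint8* dst, int dst_stride, int width) {
    int r = width & 7;   /* columns left over after the 8-wide SIMD prefix */
    int n = width - r;   /* widest multiple of 8 */
    if (n > 0) {
      TransposeWx8_SSSE3(src, src_stride, dst, dst_stride, n);
    }
    /* the leftover columns start n bytes into src and n rows into dst */
    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
  }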
diff --git a/libvpx/third_party/libyuv/source/rotate_argb.cc b/libvpx/third_party/libyuv/source/rotate_argb.cc
index ab0f9ce..787c0ad 100644
--- a/libvpx/third_party/libyuv/source/rotate_argb.cc
+++ b/libvpx/third_party/libyuv/source/rotate_argb.cc
@@ -27,36 +27,31 @@
     (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width);
+                               int src_stepx, uint8* dst_ptr, int dst_width);
 #endif
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx,
-                               uint8* dst_ptr, int dst_width);
+                               int src_stepx, uint8* dst_ptr, int dst_width);
 #endif
 
 void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx,
-                            uint8* dst_ptr, int dst_width);
+                            int src_stepx, uint8* dst_ptr, int dst_width);
 
 static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride,
-                          int width, int height) {
+                          uint8* dst, int dst_stride, int width, int height) {
   int i;
   int src_pixel_step = src_stride >> 2;
   void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
       int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
     ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
   }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
-      IS_ALIGNED(src, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
     ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
   }
 #endif
@@ -69,8 +64,7 @@
 }
 
 void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride,
-                  int width, int height) {
+                  uint8* dst, int dst_stride, int width, int height) {
   // Rotate by 90 is an ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -80,8 +74,7 @@
 }
 
 void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+                    uint8* dst, int dst_stride, int width, int height) {
   // Rotate by 270 is an ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -91,8 +84,7 @@
 }
 
 void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+                   uint8* dst, int dst_stride, int width, int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
   const uint8* src_bot = src + src_stride * (height - 1);
@@ -102,38 +94,38 @@
   void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
       ARGBMirrorRow_C;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    ARGBMirrorRow = ARGBMirrorRow_AVX2;
-  }
-#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
-    ARGBMirrorRow = ARGBMirrorRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
-    CopyRow = CopyRow_NEON;
-  }
-#endif
-#if defined(HAS_COPYROW_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    CopyRow = CopyRow_X86;
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
-    CopyRow = CopyRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
 #if defined(HAS_COPYROW_ERMS)
@@ -141,6 +133,11 @@
     CopyRow = CopyRow_ERMS;
   }
 #endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
@@ -162,8 +159,7 @@
 
 LIBYUV_API
 int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height,
+               uint8* dst_argb, int dst_stride_argb, int width, int height,
                enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
diff --git a/libvpx/third_party/libyuv/source/rotate_common.cc b/libvpx/third_party/libyuv/source/rotate_common.cc
new file mode 100644
index 0000000..b33a9a0
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[0] = src[0 * src_stride];
+    dst[1] = src[1 * src_stride];
+    dst[2] = src[2 * src_stride];
+    dst[3] = src[3 * src_stride];
+    dst[4] = src[4 * src_stride];
+    dst[5] = src[5 * src_stride];
+    dst[6] = src[6 * src_stride];
+    dst[7] = src[7 * src_stride];
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_a[0] = src[0 * src_stride + 0];
+    dst_b[0] = src[0 * src_stride + 1];
+    dst_a[1] = src[1 * src_stride + 0];
+    dst_b[1] = src[1 * src_stride + 1];
+    dst_a[2] = src[2 * src_stride + 0];
+    dst_b[2] = src[2 * src_stride + 1];
+    dst_a[3] = src[3 * src_stride + 0];
+    dst_b[3] = src[3 * src_stride + 1];
+    dst_a[4] = src[4 * src_stride + 0];
+    dst_b[4] = src[4 * src_stride + 1];
+    dst_a[5] = src[5 * src_stride + 0];
+    dst_b[5] = src[5 * src_stride + 1];
+    dst_a[6] = src[6 * src_stride + 0];
+    dst_b[6] = src[6 * src_stride + 1];
+    dst_a[7] = src[7 * src_stride + 0];
+    dst_b[7] = src[7 * src_stride + 1];
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst[i * dst_stride + j] = src[j * src_stride + i];
+    }
+  }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height) {
+  int i;
+  for (i = 0; i < width * 2; i += 2) {
+    int j;
+    for (j = 0; j < height; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
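Note on rotate_common.cc above: these C kernels are the portable fallbacks
behind the dispatchers and the _Any_ wrappers. TransposeWx8_C handles exactly
eight source rows per call; TransposeWxH_C mops up the final height % 8 rows
in TransposePlane. A small self-contained example of the WxH stride
convention (illustrative only, not part of the diff):

  #include <stdio.h>

  typedef unsigned char uint8;  /* stands in for libyuv's basic_types.h typedef */

  /* Same loop as TransposeWxH_C above, repeated so the example compiles on
     its own: dst[col][row] = src[row][col]. */
  static void TransposeWxH(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
    int i, j;
    for (i = 0; i < width; ++i) {
      for (j = 0; j < height; ++j) {
        dst[i * dst_stride + j] = src[j * src_stride + i];
      }
    }
  }

  int main(void) {
    const uint8 src[2][3] = { { 1, 2, 3 },
                              { 4, 5, 6 } };  /* 3 wide, 2 high */
    uint8 dst[3][2];
    TransposeWxH(&src[0][0], 3, &dst[0][0], 2, 3, 2);
    /* dst is now { {1, 4}, {2, 5}, {3, 6} }: 2 wide, 3 high */
    printf("%d %d\n", dst[0][0], dst[0][1]);  /* prints: 1 4 */
    return 0;
  }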
diff --git a/libvpx/third_party/libyuv/source/rotate_gcc.cc b/libvpx/third_party/libyuv/source/rotate_gcc.cc
new file mode 100644
index 0000000..fd385bc
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,493 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    ".p2align  2                                 \n"
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"
+     // Second round of bit swap.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
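+// For 32-bit GCC the UV transpose is a file-scope asm block: the prototype
+// below is only a declaration, and the body is emitted by the asm() that
+// follows, with its entry label coming from DECLARE_FUNCTION.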
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+  asm (
+    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+    "push   %ebx                               \n"
+    "push   %esi                               \n"
+    "push   %edi                               \n"
+    "push   %ebp                               \n"
+    "mov    0x14(%esp),%eax                    \n"
+    "mov    0x18(%esp),%edi                    \n"
+    "mov    0x1c(%esp),%edx                    \n"
+    "mov    0x20(%esp),%esi                    \n"
+    "mov    0x24(%esp),%ebx                    \n"
+    "mov    0x28(%esp),%ebp                    \n"
+    "mov    %esp,%ecx                          \n"
+    "sub    $0x14,%esp                         \n"
+    "and    $0xfffffff0,%esp                   \n"
+    "mov    %ecx,0x10(%esp)                    \n"
+    "mov    0x2c(%ecx),%ecx                    \n"
+
+"1:                                            \n"
+    "movdqu (%eax),%xmm0                       \n"
+    "movdqu (%eax,%edi,1),%xmm1                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm0,%xmm7                        \n"
+    "punpcklbw %xmm1,%xmm0                     \n"
+    "punpckhbw %xmm1,%xmm7                     \n"
+    "movdqa %xmm7,%xmm1                        \n"
+    "movdqu (%eax),%xmm2                       \n"
+    "movdqu (%eax,%edi,1),%xmm3                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm2,%xmm7                        \n"
+    "punpcklbw %xmm3,%xmm2                     \n"
+    "punpckhbw %xmm3,%xmm7                     \n"
+    "movdqa %xmm7,%xmm3                        \n"
+    "movdqu (%eax),%xmm4                       \n"
+    "movdqu (%eax,%edi,1),%xmm5                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm4,%xmm7                        \n"
+    "punpcklbw %xmm5,%xmm4                     \n"
+    "punpckhbw %xmm5,%xmm7                     \n"
+    "movdqa %xmm7,%xmm5                        \n"
+    "movdqu (%eax),%xmm6                       \n"
+    "movdqu (%eax,%edi,1),%xmm7                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqu %xmm5,(%esp)                       \n"
+    "neg    %edi                               \n"
+    "movdqa %xmm6,%xmm5                        \n"
+    "punpcklbw %xmm7,%xmm6                     \n"
+    "punpckhbw %xmm7,%xmm5                     \n"
+    "movdqa %xmm5,%xmm7                        \n"
+    "lea    0x10(%eax,%edi,8),%eax             \n"
+    "neg    %edi                               \n"
+    "movdqa %xmm0,%xmm5                        \n"
+    "punpcklwd %xmm2,%xmm0                     \n"
+    "punpckhwd %xmm2,%xmm5                     \n"
+    "movdqa %xmm5,%xmm2                        \n"
+    "movdqa %xmm1,%xmm5                        \n"
+    "punpcklwd %xmm3,%xmm1                     \n"
+    "punpckhwd %xmm3,%xmm5                     \n"
+    "movdqa %xmm5,%xmm3                        \n"
+    "movdqa %xmm4,%xmm5                        \n"
+    "punpcklwd %xmm6,%xmm4                     \n"
+    "punpckhwd %xmm6,%xmm5                     \n"
+    "movdqa %xmm5,%xmm6                        \n"
+    "movdqu (%esp),%xmm5                       \n"
+    "movdqu %xmm6,(%esp)                       \n"
+    "movdqa %xmm5,%xmm6                        \n"
+    "punpcklwd %xmm7,%xmm5                     \n"
+    "punpckhwd %xmm7,%xmm6                     \n"
+    "movdqa %xmm6,%xmm7                        \n"
+    "movdqa %xmm0,%xmm6                        \n"
+    "punpckldq %xmm4,%xmm0                     \n"
+    "punpckhdq %xmm4,%xmm6                     \n"
+    "movdqa %xmm6,%xmm4                        \n"
+    "movdqu (%esp),%xmm6                       \n"
+    "movlpd %xmm0,(%edx)                       \n"
+    "movhpd %xmm0,(%ebx)                       \n"
+    "movlpd %xmm4,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm2,%xmm0                        \n"
+    "punpckldq %xmm6,%xmm2                     \n"
+    "movlpd %xmm2,(%edx)                       \n"
+    "movhpd %xmm2,(%ebx)                       \n"
+    "punpckhdq %xmm6,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm1,%xmm0                        \n"
+    "punpckldq %xmm5,%xmm1                     \n"
+    "movlpd %xmm1,(%edx)                       \n"
+    "movhpd %xmm1,(%ebx)                       \n"
+    "punpckhdq %xmm5,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm3,%xmm0                        \n"
+    "punpckldq %xmm7,%xmm3                     \n"
+    "movlpd %xmm3,(%edx)                       \n"
+    "movhpd %xmm3,(%ebx)                       \n"
+    "punpckhdq %xmm7,%xmm0                     \n"
+    "sub    $0x8,%ecx                          \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "jg     1b                                 \n"
+    "mov    0x10(%esp),%esp                    \n"
+    "pop    %ebp                               \n"
+    "pop    %edi                               \n"
+    "pop    %esi                               \n"
+    "pop    %ebx                               \n"
+#if defined(__native_client__)
+    "pop    %ecx                               \n"
+    "and    $0xffffffe0,%ecx                   \n"
+    "jmp    *%ecx                              \n"
+#else
+    "ret                                       \n"
+#endif
+);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
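+// Each iteration loads 8 rows of 16 source bytes and stores 16 destination
+// rows of 8 bytes, using xmm8-xmm15 for the upper half of each source row.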
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  asm volatile (
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqu     (%0),%%xmm0                      \n"
+  "movdqu     (%0,%3),%%xmm1                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqu     (%0),%%xmm2                      \n"
+  "movdqa     %%xmm0,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm9                    \n"
+  "palignr    $0x8,%%xmm1,%%xmm1               \n"
+  "palignr    $0x8,%%xmm9,%%xmm9               \n"
+  "movdqu     (%0,%3),%%xmm3                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm10                   \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm10                   \n"
+  "movdqa     %%xmm2,%%xmm3                    \n"
+  "movdqa     %%xmm10,%%xmm11                  \n"
+  "movdqu     (%0),%%xmm4                      \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "movdqu     (%0,%3),%%xmm5                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm12                   \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm12                   \n"
+  "movdqa     %%xmm4,%%xmm5                    \n"
+  "movdqa     %%xmm12,%%xmm13                  \n"
+  "movdqu     (%0),%%xmm6                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movdqu     (%0,%3),%%xmm7                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm14                   \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "punpckhbw  %%xmm7,%%xmm14                   \n"
+  "neg        %3                               \n"
+  "movdqa     %%xmm6,%%xmm7                    \n"
+  "movdqa     %%xmm14,%%xmm15                  \n"
+  "lea        0x10(%0,%3,8),%0                 \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "neg        %3                               \n"
+   // Second round of bit swap.
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm0,%%xmm2                    \n"
+  "movdqa     %%xmm1,%%xmm3                    \n"
+  "palignr    $0x8,%%xmm2,%%xmm2               \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm4,%%xmm6                    \n"
+  "movdqa     %%xmm5,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "punpcklwd  %%xmm10,%%xmm8                   \n"
+  "punpcklwd  %%xmm11,%%xmm9                   \n"
+  "movdqa     %%xmm8,%%xmm10                   \n"
+  "movdqa     %%xmm9,%%xmm11                   \n"
+  "palignr    $0x8,%%xmm10,%%xmm10             \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "punpcklwd  %%xmm14,%%xmm12                  \n"
+  "punpcklwd  %%xmm15,%%xmm13                  \n"
+  "movdqa     %%xmm12,%%xmm14                  \n"
+  "movdqa     %%xmm13,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movq       %%xmm0,(%1)                      \n"
+  "movdqa     %%xmm0,%%xmm4                    \n"
+  "palignr    $0x8,%%xmm4,%%xmm4               \n"
+  "movq       %%xmm4,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movdqa     %%xmm2,%%xmm6                    \n"
+  "movq       %%xmm2,(%1)                      \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movq       %%xmm6,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm1,%%xmm5                    \n"
+  "movq       %%xmm1,(%1)                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "movq       %%xmm5,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movq       %%xmm3,(%1)                      \n"
+  "movdqa     %%xmm3,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "movq       %%xmm7,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm12,%%xmm8                   \n"
+  "movq       %%xmm8,(%1)                      \n"
+  "movdqa     %%xmm8,%%xmm12                   \n"
+  "palignr    $0x8,%%xmm12,%%xmm12             \n"
+  "movq       %%xmm12,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm14,%%xmm10                  \n"
+  "movdqa     %%xmm10,%%xmm14                  \n"
+  "movq       %%xmm10,(%1)                     \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "punpckldq  %%xmm13,%%xmm9                   \n"
+  "movq       %%xmm14,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm9,%%xmm13                   \n"
+  "movq       %%xmm9,(%1)                      \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movq       %%xmm13,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm15,%%xmm11                  \n"
+  "movq       %%xmm11,(%1)                     \n"
+  "movdqa     %%xmm11,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "sub        $0x10,%2                         \n"
+  "movq       %%xmm15,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "jg         1b                               \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "r"((intptr_t)(dst_stride))   // %4
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+);
+}
+
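+// Transposes interleaved UV data: each iteration reads 8 rows of 8 UV pairs
+// and writes the U bytes to dst_a and the V bytes to dst_b as 8-byte rows.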
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width) {
+  asm volatile (
+  // Read in the data from the source pointer.
+  // First round of bit swap.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqu     (%0),%%xmm0                      \n"
+  "movdqu     (%0,%4),%%xmm1                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm1                    \n"
+  "movdqu     (%0),%%xmm2                      \n"
+  "movdqu     (%0,%4),%%xmm3                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm3                    \n"
+  "movdqu     (%0),%%xmm4                      \n"
+  "movdqu     (%0,%4),%%xmm5                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm5                    \n"
+  "movdqu     (%0),%%xmm6                      \n"
+  "movdqu     (%0,%4),%%xmm7                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm8                    \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "neg        %4                               \n"
+  "lea        0x10(%0,%4,8),%0                 \n"
+  "punpckhbw  %%xmm7,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm7                    \n"
+  "neg        %4                               \n"
+   // Second round of bit swap.
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "movdqa     %%xmm1,%%xmm9                    \n"
+  "punpckhwd  %%xmm2,%%xmm8                    \n"
+  "punpckhwd  %%xmm3,%%xmm9                    \n"
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm2                    \n"
+  "movdqa     %%xmm9,%%xmm3                    \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "movdqa     %%xmm5,%%xmm9                    \n"
+  "punpckhwd  %%xmm6,%%xmm8                    \n"
+  "punpckhwd  %%xmm7,%%xmm9                    \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm8,%%xmm6                    \n"
+  "movdqa     %%xmm9,%%xmm7                    \n"
+  // Third round of bit swap.
+  // Write to the destination pointer.
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+  "punpckhdq  %%xmm4,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movlpd     %%xmm2,(%1)                      \n"
+  "movhpd     %%xmm2,(%2)                      \n"
+  "punpckhdq  %%xmm6,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm1,%%xmm8                    \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movlpd     %%xmm1,(%1)                      \n"
+  "movhpd     %%xmm1,(%2)                      \n"
+  "punpckhdq  %%xmm5,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm3,%%xmm8                    \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movlpd     %%xmm3,(%1)                      \n"
+  "movhpd     %%xmm3,(%2)                      \n"
+  "punpckhdq  %%xmm7,%%xmm8                    \n"
+  "sub        $0x8,%3                          \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "jg         1b                               \n"
+  : "+r"(src),    // %0
+    "+r"(dst_a),  // %1
+    "+r"(dst_b),  // %2
+    "+r"(width)   // %3
+  : "r"((intptr_t)(src_stride)),    // %4
+    "r"((intptr_t)(dst_stride_a)),  // %5
+    "r"((intptr_t)(dst_stride_b))   // %6
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9"
+);
+}
+#endif
+#endif
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libvpx/third_party/libyuv/source/rotate_mips.cc b/libvpx/third_party/libyuv/source/rotate_mips.cc
index 70770fd..efe6bd9 100644
--- a/libvpx/third_party/libyuv/source/rotate_mips.cc
+++ b/libvpx/third_party/libyuv/source/rotate_mips.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -22,8 +23,7 @@
     (_MIPS_SIM == _MIPS_SIM_ABI32)
 
 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride,
-                             int width) {
+                             uint8* dst, int dst_stride, int width) {
    __asm__ __volatile__ (
       ".set push                                         \n"
       ".set noreorder                                    \n"
@@ -106,9 +106,8 @@
   );
 }
 
-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
-                                  uint8* dst, int dst_stride,
-                                  int width) {
+void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
+                                  uint8* dst, int dst_stride, int width) {
   __asm__ __volatile__ (
       ".set noat                                         \n"
       ".set push                                         \n"
diff --git a/libvpx/third_party/libyuv/source/rotate_neon.cc b/libvpx/third_party/libyuv/source/rotate_neon.cc
index d354e11..76043b3 100644
--- a/libvpx/third_party/libyuv/source/rotate_neon.cc
+++ b/libvpx/third_party/libyuv/source/rotate_neon.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -17,7 +18,8 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 static uvec8 kVTbl4x4Transpose =
   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
@@ -525,7 +527,7 @@
       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
   );
 }
-#endif
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/rotate_neon64.cc b/libvpx/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 0000000..f52c082
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
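+// Shuffle table for the 4x8 residual path below: tbl uses it to reorder a
+// 16-byte register holding four 4-byte rows into column-major order
+// (a 4x4 byte transpose).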
+static uvec8 kVTbl4x4Transpose =
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+    "sub         %3, %3, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                          \n"
+      "mov         %0, %1                        \n"
+
+      MEMACCESS(0)
+      "ld1        {v0.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v1.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v2.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v3.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v4.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v5.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v6.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v7.8b}, [%0]                  \n"
+
+      "trn2     v16.8b, v0.8b, v1.8b             \n"
+      "trn1     v17.8b, v0.8b, v1.8b             \n"
+      "trn2     v18.8b, v2.8b, v3.8b             \n"
+      "trn1     v19.8b, v2.8b, v3.8b             \n"
+      "trn2     v20.8b, v4.8b, v5.8b             \n"
+      "trn1     v21.8b, v4.8b, v5.8b             \n"
+      "trn2     v22.8b, v6.8b, v7.8b             \n"
+      "trn1     v23.8b, v6.8b, v7.8b             \n"
+
+      "trn2     v3.4h, v17.4h, v19.4h            \n"
+      "trn1     v1.4h, v17.4h, v19.4h            \n"
+      "trn2     v2.4h, v16.4h, v18.4h            \n"
+      "trn1     v0.4h, v16.4h, v18.4h            \n"
+      "trn2     v7.4h, v21.4h, v23.4h            \n"
+      "trn1     v5.4h, v21.4h, v23.4h            \n"
+      "trn2     v6.4h, v20.4h, v22.4h            \n"
+      "trn1     v4.4h, v20.4h, v22.4h            \n"
+
+      "trn2     v21.2s, v1.2s, v5.2s             \n"
+      "trn1     v17.2s, v1.2s, v5.2s             \n"
+      "trn2     v20.2s, v0.2s, v4.2s             \n"
+      "trn1     v16.2s, v0.2s, v4.2s             \n"
+      "trn2     v23.2s, v3.2s, v7.2s             \n"
+      "trn1     v19.2s, v3.2s, v7.2s             \n"
+      "trn2     v22.2s, v2.2s, v6.2s             \n"
+      "trn1     v18.2s, v2.2s, v6.2s             \n"
+
+      "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+      "st1      {v17.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v16.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v19.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v18.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v21.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v20.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v23.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v22.8b}, [%0]                   \n"
+
+      "add         %1, %1, #8                    \n"  // src += 8
+      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "b.ge        1b                            \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %3, %3, #8                      \n"
+    "b.eq        4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    "cmp         %3, #4                          \n"
+    "b.lt        2f                              \n"
+
+    // 4x8 block
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[3], [%0]                     \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(4)
+    "ld1      {v2.16b}, [%4]                     \n"
+
+    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                         \n"
+
+    "add         %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                         \n"
+
+    "add         %1, %1, #4                      \n"  // src += 4
+    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                      \n"  // w   -= 4
+    "b.eq        4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    // 2x8 block
+    "2:                                          \n"
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                     \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b                 \n"
+    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                       \n"
+
+    "add         %1, %1, #2                      \n"  // src += 2
+    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+    "subs        %3, %3,  #2                     \n"  // w   -= 2
+    "b.eq        4f                              \n"
+
+    // 1x8 block
+    "3:                                          \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]                 \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                   \n"
+
+    "4:                                          \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
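+// Shuffle table for the UV residual path: the first 16 indices gather the U
+// bytes of four source rows in column order, the second 16 gather the
+// matching V bytes.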
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+    "mov       %0, %1                          \n"
+
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v1.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v2.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v3.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v4.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v5.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v6.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v7.16b}, [%0]                  \n"
+
+    "trn1      v16.16b, v0.16b, v1.16b         \n"
+    "trn2      v17.16b, v0.16b, v1.16b         \n"
+    "trn1      v18.16b, v2.16b, v3.16b         \n"
+    "trn2      v19.16b, v2.16b, v3.16b         \n"
+    "trn1      v20.16b, v4.16b, v5.16b         \n"
+    "trn2      v21.16b, v4.16b, v5.16b         \n"
+    "trn1      v22.16b, v6.16b, v7.16b         \n"
+    "trn2      v23.16b, v6.16b, v7.16b         \n"
+
+    "trn1      v0.8h, v16.8h, v18.8h           \n"
+    "trn2      v1.8h, v16.8h, v18.8h           \n"
+    "trn1      v2.8h, v20.8h, v22.8h           \n"
+    "trn2      v3.8h, v20.8h, v22.8h           \n"
+    "trn1      v4.8h, v17.8h, v19.8h           \n"
+    "trn2      v5.8h, v17.8h, v19.8h           \n"
+    "trn1      v6.8h, v21.8h, v23.8h           \n"
+    "trn2      v7.8h, v21.8h, v23.8h           \n"
+
+    "trn1      v16.4s, v0.4s, v2.4s            \n"
+    "trn2      v17.4s, v0.4s, v2.4s            \n"
+    "trn1      v18.4s, v1.4s, v3.4s            \n"
+    "trn2      v19.4s, v1.4s, v3.4s            \n"
+    "trn1      v20.4s, v4.4s, v6.4s            \n"
+    "trn2      v21.4s, v4.4s, v6.4s            \n"
+    "trn1      v22.4s, v5.4s, v7.4s            \n"
+    "trn2      v23.4s, v5.4s, v7.4s            \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v16.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[1], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v20.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v20.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[1], [%0]                \n"
+
+    "add       %1, %1, #16                     \n"  // src   += 8*2
+    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
+    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
+    "subs      %4, %4,  #8                     \n"  // w     -= 8
+    "b.ge      1b                              \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds      %4, %4, #8                      \n"
+    "b.eq      4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    "cmp       %4, #4                          \n"
+    "b.lt      2f                              \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1       {v0.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v1.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v2.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v3.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v4.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v5.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v6.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v7.8b}, [%0]                   \n"
+
+    MEMACCESS(8)
+    "ld1       {v30.16b}, [%8], #16            \n"
+    "ld1       {v31.16b}, [%8]                 \n"
+
+    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.s}[0],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[1],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[2],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[3],  [%0], %6           \n"
+
+    "add       %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[2], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[3], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v17.s}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[2], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[3], [%0], %7            \n"
+
+    "add       %0, %3, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[0],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[1],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[2],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[3],  [%0]               \n"
+
+    "add       %1, %1, #8                      \n"  // src   += 4 * 2
+    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
+    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
+    "subs      %4,  %4,  #4                    \n"  // w     -= 4
+    "b.eq      4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+    "trn1      v4.8b, v0.8b, v2.8b             \n"
+    "trn2      v5.8b, v0.8b, v2.8b             \n"
+    "trn1      v6.8b, v1.8b, v3.8b             \n"
+    "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v4.d}[0], [%0], %6             \n"
+    MEMACCESS(0)
+    "st1       {v6.d}[0], [%0]                 \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v5.d}[0], [%0], %7             \n"
+    MEMACCESS(0)
+    "st1       {v7.d}[0], [%0]                 \n"
+
+    "add       %1, %1, #4                      \n"  // src   += 2 * 2
+    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
+    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
+    "subs      %4,  %4,  #2                    \n"  // w     -= 2
+    "b.eq      4f                              \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+    MEMACCESS(2)
+    "st1       {v0.d}[0], [%2]                 \n"
+    MEMACCESS(3)
+    "st1       {v1.d}[0], [%3]                 \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libvpx/third_party/libyuv/source/rotate_win.cc b/libvpx/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 0000000..2760066
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,248 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
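+// MSVC __asm counterparts of the SSSE3/SSE2 transpose kernels in
+// rotate_gcc.cc; __declspec(naked) means each routine supplies its own
+// prologue and epilogue.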
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    jg        convertloop
+
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w) {
+  __asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+
+    align      4
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqu    xmm2, [eax]
+    movdqu    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqu    xmm4, [eax]
+    movdqu    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqu    xmm6, [eax]
+    movdqu    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqu    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]
+    neg       edi
+    // Second round of bit swap.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqu    xmm5, [esp]  // restore xmm5
+    movdqu    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of bit swap.
+    // Write to the destination pointer.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqu    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libvpx/third_party/libyuv/source/row_any.cc b/libvpx/third_party/libyuv/source/row_any.cc
index 97ef844..1cb1f6b 100644
--- a/libvpx/third_party/libyuv/source/row_any.cc
+++ b/libvpx/third_party/libyuv/source/row_any.cc
@@ -10,6 +10,8 @@
 
 #include "libyuv/row.h"
 
+#include <string.h>  // For memset.
+
 #include "libyuv/basic_types.h"
 
 #ifdef __cplusplus
@@ -17,526 +19,660 @@
 extern "C" {
 #endif
 
-// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
-// TODO(fbarchard): Consider 'any' functions handling odd alignment.
-// YUV to RGB does multiple of 8 with SIMD and remainder with C.
-#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK)        \
-    void NAMEANY(const uint8* y_buf,                                           \
-                 const uint8* u_buf,                                           \
-                 const uint8* v_buf,                                           \
-                 uint8* rgb_buf,                                               \
-                 int width) {                                                  \
+// Subsampled source needs to be increase by 1 of not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
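+// e.g. SS(17, 1) == 9: a 17 pixel luma width needs 9 subsampled chroma samples.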
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;                                                    \
       int n = width & ~MASK;                                                   \
-      I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n);                         \
-      I420TORGB_C(y_buf + n,                                                   \
-                  u_buf + (n >> UV_SHIFT),                                     \
-                  v_buf + (n >> UV_SHIFT),                                     \
-                  rgb_buf + n * BPP, width & MASK);                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
     }
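+// Example: with MASK = 7 and width = 13, the wrapper runs ANY_SIMD on the
+// first n = 8 pixels, copies the r = 5 leftover Y bytes (and SS(5, UVSHIFT)
+// U/V bytes) into the aligned temp buffers, runs ANY_SIMD once more on a full
+// 8-pixel block, and copies back only the SS(r, DUVSHIFT) * BPP valid output
+// bytes.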
 
 #ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
-     1, 4, 7)
-#endif  // HAS_I422TOARGBROW_SSSE3
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
 #ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
-     0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
-     2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
-     1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
-     1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
-     1, 4, 7)
-// I422ToRGB565Row_SSSE3 is unaligned.
-YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
-     1, 2, 7)
-YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
-     1, 2, 7)
-YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
-     1, 2, 7)
-// I422ToRGB24Row_SSSE3 is unaligned.
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
-YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
 #endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TORAWROW_AVX2
+ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_J422TOARGBROW_SSSE3
+ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_J422TOARGBROW_AVX2
+ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
 #ifdef HAS_I422TOARGBROW_AVX2
-YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
-#endif  // HAS_I422TOARGBROW_AVX2
+ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOBGRAROW_AVX2
+ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOABGRROW_AVX2
+ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif
 #ifdef HAS_I422TOARGBROW_NEON
-YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
-YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
-YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
-YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
-YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
-YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
-     1, 2, 7)
-YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
-     1, 2, 7)
-YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
-#endif  // HAS_I422TOARGBROW_NEON
-#undef YANY
+ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
+ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#undef ANY31
 
-// Wrappers to handle odd width
-#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP)             \
-    void NAMEANY(const uint8* y_buf,                                           \
-                 const uint8* uv_buf,                                          \
-                 uint8* rgb_buf,                                               \
-                 int width) {                                                  \
-      int n = width & ~7;                                                      \
-      NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n);                               \
-      NV12TORGB_C(y_buf + n,                                                   \
-                  uv_buf + (n >> UV_SHIFT),                                    \
-                  rgb_buf + n * BPP, width & 7);                               \
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
+      }                                                                        \
+      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+             SS(r, UVSHIFT) * SBPP2);                                          \
+      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
     }
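The same wrapper covers both biplanar YUV and two independent planes; an editorial note on how the parameters are meant, inferred from the instantiations below rather than stated by upstream:

/* NV12ToARGBRow_Any_SSSE3(..., 1, 1, 2, 4, 7): y_buf advances SBPP = 1 byte
 * per pixel, uv_buf is indexed at (n >> UVSHIFT) * SBPP2 = (n >> 1) * 2 bytes,
 * i.e. one 2-byte UV pair per 2 pixels, and the output is BPP = 4 bytes/pixel.
 * MergeUVRow_Any_SSE2(..., 0, 1, 1, 2, 15): both sources are single planes
 * (UVSHIFT = 0, 1 byte each) and the interleaved UV output is 2 bytes wide. */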
 
+// Biplanar to RGB.
 #ifdef HAS_NV12TOARGBROW_SSSE3
-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
-      0, 4)
-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
-      0, 4)
-#endif  // HAS_NV12TOARGBROW_SSSE3
+ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
 #ifdef HAS_NV12TOARGBROW_NEON
-NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
-NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
-#endif  // HAS_NV12TOARGBROW_NEON
+ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
-NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
-      0, 2)
-NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
-      0, 2)
-#endif  // HAS_NV12TORGB565ROW_SSSE3
+ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
 #ifdef HAS_NV12TORGB565ROW_NEON
-NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
-NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
-#endif  // HAS_NV12TORGB565ROW_NEON
-#undef NVANY
-
-#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)          \
-    void NAMEANY(const uint8* src,                                             \
-                 uint8* dst,                                                   \
-                 int width) {                                                  \
-      int n = width & ~MASK;                                                   \
-      ARGBTORGB_SIMD(src, dst, n);                                             \
-      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK);                \
-    }
-
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
-       15, 4, 3)
-RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
-       15, 4, 3)
-RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
-       3, 4, 2)
-RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
-       3, 4, 2)
-RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
-       3, 4, 2)
-#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
-       7, 1, 4)
-#endif
-#if defined(HAS_YTOARGBROW_SSE2)
-RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
-       7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
-       15, 2, 4)
-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
-       15, 2, 4)
-// These require alignment on ARGB, so C is used for remainder.
-RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
-       15, 3, 4)
-RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
-       15, 3, 4)
-RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
-       7, 2, 4)
-RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
-       7, 2, 4)
-RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
-       7, 2, 4)
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
-RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
-       7, 4, 2)
-RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
-       7, 4, 2)
-RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
-       7, 4, 2)
-RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
-       7, 1, 4)
-RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
-       7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
-       7, 2, 4)
-RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
-       7, 2, 4)
-#endif
-#undef RGBANY
-
-// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)        \
-    void NAMEANY(const uint8* src,                                             \
-                 uint8* dst, uint32 selector,                                  \
-                 int width) {                                                  \
-      int n = width & ~MASK;                                                   \
-      ARGBTORGB_SIMD(src, dst, selector, n);                                   \
-      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK);      \
-    }
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
-         7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERROW_NEON)
-BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
-         7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
-BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
-         7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
-BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
-         7, 4, 1)
+ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
 #endif
 
-#undef BAYERANY
-
-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
-#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM)                            \
-    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
-      ARGBTOY_SIMD(src_argb, dst_y, width - NUM);                              \
-      ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP,                            \
-                   dst_y + (width - NUM) * BPP, NUM);                          \
-    }
-
-#ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
-YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
-YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
-#endif
-#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
-#endif
-#ifdef HAS_BGRATOYROW_SSSE3
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
-#endif
-#ifdef HAS_ARGBTOYJROW_SSSE3
-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
-#endif
-#ifdef HAS_ARGBTOYROW_NEON
-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
-YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
-YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
-YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
-YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
-YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
-YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
-YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
-YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
-YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
-YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
-YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
-YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
-YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
-YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
-#endif
-#undef YANY
-
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
-    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
-      int n = width & ~MASK;                                                   \
-      ARGBTOY_SIMD(src_argb, dst_y, n);                                        \
-      ARGBTOY_C(src_argb + n * SBPP,                                           \
-                dst_y  + n * BPP, width & MASK);                               \
-    }
-
-// Attenuate is destructive so last16 method can not be used due to overlap.
-#ifdef HAS_ARGBATTENUATEROW_SSSE3
-YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
-     4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_SSE2
-YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
-     4, 4, 3)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
-YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
-     4, 4, 3)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_AVX2
-YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
-     4, 4, 7)
-#endif
-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
-YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
-     4, 4, 7)
-#endif
-#ifdef HAS_ARGBATTENUATEROW_NEON
-YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
-     4, 4, 7)
-#endif
-#undef YANY
-
-// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
-#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_argb, int src_stride_argb,                   \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      int n = width & ~MASK;                                                   \
-      ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n);                \
-      ANYTOUV_C(src_argb  + n * BPP, src_stride_argb,                          \
-                dst_u + (n >> 1),                                              \
-                dst_v + (n >> 1),                                              \
-                width & MASK);                                                 \
-    }
-
-#ifdef HAS_ARGBTOUVROW_AVX2
-UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
-UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
-UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
-      4, 15)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_NEON
-UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
-UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
-UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
-UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
-UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
-UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
-#endif
-#undef UVANY
-
-#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT)           \
-    void NAMEANY(const uint8* src_uv,                                          \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      int n = width & ~MASK;                                                   \
-      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
-      ANYTOUV_C(src_uv  + n * BPP,                                             \
-                dst_u + (n >> SHIFT),                                          \
-                dst_v + (n >> SHIFT),                                          \
-                width & MASK);                                                 \
-    }
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
-         ARGBToUV444Row_C, 4, 15, 0)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_AVX2
-UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
-         YUY2ToUV422Row_C, 2, 31, 1)
-UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
-         UYVYToUV422Row_C, 2, 31, 1)
-#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
-         ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
-         YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
-         UYVYToUV422Row_C, 2, 15, 1)
-#endif
-#ifdef HAS_YUY2TOUV422ROW_NEON
-UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
-         ARGBToUV444Row_C, 4, 7, 0)
-UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
-         ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
-         ARGBToUV411Row_C, 4, 31, 2)
-UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
-         YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
-         UYVYToUV422Row_C, 2, 15, 1)
-#endif
-#undef UV422ANY
-
-#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                  \
-    void NAMEANY(const uint8* src_uv,                                          \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      int n = width & ~MASK;                                                   \
-      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
-      ANYTOUV_C(src_uv + n * 2,                                                \
-                dst_u + n,                                                     \
-                dst_v + n,                                                     \
-                width & MASK);                                                 \
-    }
-
-#ifdef HAS_SPLITUVROW_SSE2
-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
-#endif
-#ifdef HAS_SPLITUVROW_AVX2
-SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
-#endif
-#ifdef HAS_SPLITUVROW_NEON
-SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
-#endif
-#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
-              SplitUVRow_C, 15)
-#endif
-#undef SPLITUVROWANY
-
-#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
-    void NAMEANY(const uint8* src_u, const uint8* src_v,                       \
-                 uint8* dst_uv, int width) {                                   \
-      int n = width & ~MASK;                                                   \
-      ANYTOUV_SIMD(src_u, src_v, dst_uv, n);                                   \
-      ANYTOUV_C(src_u + n,                                                     \
-                src_v + n,                                                     \
-                dst_uv + n * 2,                                                \
-                width & MASK);                                                 \
-    }
-
+// Merge functions.
 #ifdef HAS_MERGEUVROW_SSE2
-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
-MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
-#undef MERGEUVROW_ANY
 
-#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK)                  \
-    void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,               \
-                 uint8* dst_argb, int width) {                                 \
-      int n = width & ~MASK;                                                   \
-      ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n);                        \
-      ARGBMATH_C(src_argb0 + n * 4,                                            \
-                 src_argb1 + n * 4,                                            \
-                 dst_argb + n * 4,                                             \
-                 width & MASK);                                                \
-    }
-
+// Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
-            3)
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
 #endif
 #ifdef HAS_ARGBADDROW_SSE2
-MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
 #endif
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
-MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
-            3)
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
 #endif
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
-MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
-            7)
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBADDROW_AVX2
-MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
-MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
-            7)
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBMULTIPLYROW_NEON
-MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
-            7)
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBADDROW_NEON
-MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSUBTRACTROW_NEON
-MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
-            7)
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
-#undef MATHROW_ANY
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
 
-// Shuffle may want to work in place, so last16 method can not be used.
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
-    void NAMEANY(const uint8* src_argb, uint8* dst_argb,                       \
-                 const uint8* shuffler, int width) {                           \
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
+      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
+      int r = width & MASK;                                                    \
       int n = width & ~MASK;                                                   \
-      ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n);                           \
-      ARGBTOY_C(src_argb + n * SBPP,                                           \
-                dst_argb  + n * BPP, shuffler, width & MASK);                  \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
     }
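Editorial note: for packed YUV sources the UVSHIFT/SBPP pair expresses bytes per pixel pair rather than per pixel, for example:

/* YUY2ToARGBRow_Any_SSSE3(..., 1, 4, 4, 15): the source offset is
 * (n >> 1) * 4 bytes, i.e. 4 bytes per 2 YUY2 pixels, and SS(r, 1) * 4 bytes
 * of the packed remainder are staged into temp before the padded call. */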
 
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#undef ANY11
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 T shuffler, int width) {                                      \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
+      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
+    }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+       const uint32, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+       const uint32, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+       const uint32, 4, 2, 7)
+#endif
 #ifdef HAS_ARGBSHUFFLEROW_SSE2
-YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
-     ARGBShuffleRow_C, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
-     ARGBShuffleRow_C, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
-     ARGBShuffleRow_C, 4, 4, 15)
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
-YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
-     ARGBShuffleRow_C, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
 #endif
-#undef YANY
+#undef ANY11P
 
-// Interpolate may want to work in place, so last16 method can not be used.
-#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK)                      \
+// Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
     void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
                  ptrdiff_t src_stride_ptr, int width,                          \
                  int source_y_fraction) {                                      \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;                                                    \
       int n = width & ~MASK;                                                   \
-      TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr,                              \
-                n, source_y_fraction);                                         \
-      TERP_C(dst_ptr + n * BPP,                                                \
-             src_ptr + n * SBPP, src_stride_ptr,                               \
-             width & MASK, source_y_fraction);                                 \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
+      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
     }
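Editorial note on the remainder path: both source rows are staged into temp so the padded call can use a fixed stride:

/* The row-0 remainder goes to temp, the row-1 remainder
 * (src_ptr + src_stride_ptr) to temp + 64, and the padded ANY_SIMD call is
 * given a stride of 64 so it blends the same vertical pixel pairs the bulk
 * call did.  Its output lands at temp + 128 and is copied to dst_ptr. */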
 
 #ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
-     InterpolateRow_C, 1, 1, 32)
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
-     InterpolateRow_C, 1, 1, 15)
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
-     InterpolateRow_C, 1, 1, 15)
+ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
-     InterpolateRow_C, 1, 1, 15)
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
-     InterpolateRow_C, 1, 1, 3)
+ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
 #endif
-#undef NANY
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr, r * BPP);                                          \
+      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
+      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
+    }
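Mirroring needs the remainder at the start of the source but the end of the destination; a worked example (editorial) for MirrorRow_Any_SSE2 (BPP = 1, MASK = 15):

/* width = 21: n = 16, r = 5.
 * ANY_SIMD(src_ptr + 5, dst_ptr, 16) mirrors src[5..20] into dst[0..15].
 * src[0..4] are copied to temp, the padded call mirrors the 16-byte block
 * into temp + 64, so the reversed copies of src[0..4] sit at
 * temp + 64 + 11..15, which is temp + 64 + (MASK + 1 - r) * BPP; copying them
 * to dst[16..20] gives dst[j] = src[width - 1 - j] for the whole row. */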
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_SSE2
+ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
+    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
+      SIMD_ALIGNED(uint8 temp[64]);                                            \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, v32, n);                                             \
+      }                                                                        \
+      ANY_SIMD(temp, v32, MASK + 1);                                           \
+      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
+    }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2.  Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
+      memset(temp, 0, 128);  /* for msan */                                    \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \
+      }                                                                        \
+      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
+      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
+      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
+    }
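The odd-width duplication exists so the 2x1 subsampler averages a real pixel instead of zeroed padding; a worked example (editorial) for ARGBToUV422Row_Any_SSSE3 (UVSHIFT = 0, BPP = 4, DUVSHIFT = 1, MASK = 15):

/* width = 13: n = 0, r = 13.  13 * 4 = 52 source bytes are staged into temp
 * and, because width is odd and BPP == 4, the last ARGB pixel is repeated at
 * temp + 52 so pixel 13 pairs with a copy of pixel 12.  The padded call then
 * produces SS(13, 1) = 7 U and 7 V bytes, which are copied to dst_u/dst_v. */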
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
+// A 128-byte row allows for 32 AVX ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
+      memset(temp, 0, 128 * 2);  /* for msan */                                \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
+             SS(r, UVSHIFT) * BPP);                                            \
+      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \
+        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4);                    \
+      }                                                                        \
+      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+    }
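Editorial note on the literal stride in the padded call:

/* ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1): 'temp' holds the
 * row-0 remainder and 'temp + 128' the row-1 remainder, so a stride of 128
 * makes the padded call average the same vertical pixel pairs. */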
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/row_common.cc b/libvpx/third_party/libyuv/source/row_common.cc
index fa2b752..4987589 100644
--- a/libvpx/third_party/libyuv/source/row_common.cc
+++ b/libvpx/third_party/libyuv/source/row_common.cc
@@ -199,6 +199,40 @@
   }
 }
 
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to add to RGB.  When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of dither4 corresponds to the upper-left entry of the
+// matrix row.  The 4 values are passed as an int and referenced as an array,
+// so endianness will not affect the order of the original matrix.  But
+// dither4 will contain the first pixel in the lower byte on little-endian
+// machines, or in the upper byte on big-endian machines.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
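A worked example (editorial) of the per-pixel dither arithmetic above:

/* With dither0 = 6 and an ARGB pixel B = 0x10, G = 0x20, R = 0x30:
 *   b0 = clamp255(0x10 + 6) >> 3 = 2
 *   g0 = clamp255(0x20 + 6) >> 2 = 9
 *   r0 = clamp255(0x30 + 6) >> 3 = 6
 *   packed 565 value = 2 | (9 << 5) | (6 << 11) = 0x3122. */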
+
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -385,6 +419,28 @@
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 #undef MAKEROWYJ
 
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToUJ(ar, ag, ab);
+    dst_v[0] = RGBToVJ(ar, ag, ab);
+    src_argb += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToUJ(ar, ag, ab);
+    dst_v[0] = RGBToVJ(ar, ag, ab);
+  }
+}
+
 void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -926,7 +982,7 @@
   }
 }
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
   int x;
   for (x = 0; x < width; ++x) {
@@ -938,33 +994,96 @@
   }
 }
 
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
 // C reference code that mimics the YUV assembly.
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
 static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
                               uint8* b, uint8* g, uint8* r) {
-  int32 y1 = ((int32)(y) - 16) * YG;
-  *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
-  *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
-  *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
+  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
+  *r = Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
 }
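A small standalone check (editorial sketch, not part of the patch; the helper names are illustrative and the constants are copied from the definitions above) that the fixed-point math reproduces BT.601 black and white:

#include <assert.h>
#include <stdint.h>
static int ClampSketch(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  enum { kYG = 18997, kYGB = -1160, kUB = -128, kUG = 25, kVG = 52,
         kVR = -102 };
  const int kBB = kUB * 128 + kYGB;
  const int kBG = kUG * 128 + kVG * 128 + kYGB;
  const int kBR = kVR * 128 + kYGB;
  uint32_t y1 = (uint32_t)(y * 0x0101 * kYG) >> 16;
  *b = (uint8_t)ClampSketch((int)(-(u * kUB) + y1 + kBB) >> 6);
  *g = (uint8_t)ClampSketch((int)(-(v * kVG + u * kUG) + y1 + kBG) >> 6);
  *r = (uint8_t)ClampSketch((int)(-(v * kVR) + y1 + kBR) >> 6);
}
int main(void) {
  uint8_t b, g, r;
  YuvPixelSketch(16, 128, 128, &b, &g, &r);   /* video black */
  assert(b == 0 && g == 0 && r == 0);
  YuvPixelSketch(235, 128, 128, &b, &g, &r);  /* video white */
  assert(b == 255 && g == 255 && r == 255);
  return 0;
}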
 
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32)(y1 + YGB) >> 6);
+  *g = Clamp((int32)(y1 + YGB) >> 6);
+  *r = Clamp((int32)(y1 + YGB) >> 6);
+}
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// C reference code that mimics the YUV assembly.
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+                               uint8* b, uint8* g, uint8* r) {
+  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
+  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
+  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
+  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
+}
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
 // C mimic assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
 void I444ToARGBRow_C(const uint8* src_y,
@@ -1008,6 +1127,7 @@
   }
 }
 #endif
+
 // Also used for 420
 void I422ToARGBRow_C(const uint8* src_y,
                      const uint8* src_u,
@@ -1034,6 +1154,31 @@
   }
 }
 
+void J422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvJPixel(src_y[0], src_u[0], src_v[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvJPixel(src_y[1], src_u[0], src_v[0],
+              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvJPixel(src_y[0], src_u[0], src_v[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
 void I422ToRGB24Row_C(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
@@ -1233,23 +1378,23 @@
 }
 
 void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* usrc_v,
+                     const uint8* src_uv,
                      uint8* rgb_buf,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
+    YuvPixel(src_y[1], src_uv[0], src_uv[1],
              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
     rgb_buf[7] = 255;
     src_y += 2;
-    usrc_v += 2;
+    src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
     rgb_buf[3] = 255;
   }
@@ -1281,7 +1426,7 @@
 }
 
 void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* usrc_v,
+                       const uint8* src_uv,
                        uint8* dst_rgb565,
                        int width) {
   uint8 b0;
@@ -1292,8 +1437,8 @@
   uint8 r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
-    YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
@@ -1303,11 +1448,11 @@
     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
         (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
-    usrc_v += 2;
+    src_uv += 2;
     dst_rgb565 += 4;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
@@ -1467,21 +1612,18 @@
   }
 }
 
-void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], 128, 128,
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], 128, 128,
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
     rgb_buf[7] = 255;
     src_y += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], 128, 128,
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
     rgb_buf[3] = 255;
   }
 }
@@ -1569,28 +1711,15 @@
   memcpy(dst, src, count * 2);
 }
 
-void SetRow_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
-  // VC will generate rep stosb.
-  int x;
-  for (x = 0; x < count; ++x) {
-    dst[x] = v8;
-  }
-#else
-  memset(dst, v8, count);
-#endif
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+  memset(dst, v8, width);
 }
 
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
-                 int dst_stride, int height) {
-  int y;
-  for (y = 0; y < height; ++y) {
-    uint32* d = (uint32*)(dst);
-    int x;
-    for (x = 0; x < width; ++x) {
-      d[x] = v32;
-    }
-    dst += dst_stride;
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+  uint32* d = (uint32*)(dst_argb);
+  int x;
+  for (x = 0; x < width; ++x) {
+    d[x] = v32;
   }
 }
 
@@ -1885,17 +2014,17 @@
   }
 }
 
-// Blend 2 rows into 1 for conversions such as I422ToI420.
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-               uint8* dst_uv, int pix) {
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+                      uint8* dst_uv, int pix) {
   int x;
   for (x = 0; x < pix; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
   }
 }
 
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
-                  uint16* dst_uv, int pix) {
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                         uint16* dst_uv, int pix) {
   int x;
   for (x = 0; x < pix; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -1957,40 +2086,6 @@
   }
 }
 
-// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
-void ARGBToBayerRow_C(const uint8* src_argb,
-                      uint8* dst_bayer, uint32 selector, int pix) {
-  int index0 = selector & 0xff;
-  int index1 = (selector >> 8) & 0xff;
-  // Copy a row of Bayer.
-  int x;
-  for (x = 0; x < pix - 1; x += 2) {
-    dst_bayer[0] = src_argb[index0];
-    dst_bayer[1] = src_argb[index1];
-    src_argb += 8;
-    dst_bayer += 2;
-  }
-  if (pix & 1) {
-    dst_bayer[0] = src_argb[index0];
-  }
-}
-
-// Select G channel from ARGB.  e.g.  GGGGGGGG
-void ARGBToBayerGGRow_C(const uint8* src_argb,
-                        uint8* dst_bayer, uint32 selector, int pix) {
-  // Copy a row of G.
-  int x;
-  for (x = 0; x < pix - 1; x += 2) {
-    dst_bayer[0] = src_argb[1];
-    dst_bayer[1] = src_argb[5];
-    src_argb += 8;
-    dst_bayer += 2;
-  }
-  if (pix & 1) {
-    dst_bayer[0] = src_argb[1];
-  }
-}
-
 // Use first 4 shuffler values to reorder ARGB channels.
 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
                       const uint8* shuffler, int pix) {
@@ -2033,7 +2128,7 @@
   if (width & 1) {
     dst_frame[0] = src_y[0];
     dst_frame[1] = src_u[0];
-    dst_frame[2] = src_y[0];  // duplicate last y
+    dst_frame[2] = 0;
     dst_frame[3] = src_v[0];
   }
 }
@@ -2057,126 +2152,321 @@
     dst_frame[0] = src_u[0];
     dst_frame[1] = src_y[0];
     dst_frame[2] = src_v[0];
-    dst_frame[3] = src_y[0];  // duplicate last y
+    dst_frame[3] = 0;
   }
 }
 
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && !defined(__clang__)) && \
+    defined(HAS_I422TORGB565ROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.
-#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
 void I422ToRGB565Row_SSSE3(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
-                           uint8* rgb_buf,
+                           uint8* dst_rgb565,
                            int width) {
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
-  ARGBToRGB565Row_SSE2(row, rgb_buf, width);
-  free_aligned_buffer_64(row);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
 }
-#endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+#endif
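These rewritten two-step wrappers replace the old per-call heap allocation (align_buffer_64 of a full-width ARGB row) with a fixed SIMD_ALIGNED stack buffer and process the row in chunks of at most MAXTWIDTH (2048) pixels: each pass converts a chunk to ARGB, converts that ARGB slice to the destination format, and advances Y by twidth, the 4:2:2 U/V planes by twidth / 2, and the destination by twidth times its bytes per pixel. A minimal generic sketch of the pattern, with hypothetical names (the real wrappers hard-code the SSSE3/AVX2 kernel pair instead of taking function pointers):

  typedef unsigned char uint8;
  enum { kMaxTWidth = 2048 };  /* mirrors MAXTWIDTH above */

  static void TwoStepRow(const uint8* src_y, const uint8* src_u,
                         const uint8* src_v, uint8* dst, int width, int dst_bpp,
                         void (*to_argb)(const uint8*, const uint8*,
                                         const uint8*, uint8*, int),
                         void (*from_argb)(const uint8*, uint8*, int)) {
    uint8 row[kMaxTWidth * 4];  /* the real wrappers use SIMD_ALIGNED here */
    while (width > 0) {
      int twidth = width > kMaxTWidth ? kMaxTWidth : width;
      to_argb(src_y, src_u, src_v, row, twidth);  /* e.g. I422ToARGBRow_SSSE3 */
      from_argb(row, dst, twidth);                /* e.g. ARGBToRGB565Row_SSE2 */
      src_y += twidth;
      src_u += twidth / 2;  /* 4:2:2 chroma is half width */
      src_v += twidth / 2;
      dst += twidth * dst_bpp;  /* 2 for RGB565/ARGB1555/ARGB4444, 3 for RGB24 */
      width -= twidth;
    }
  }

This bounds the temporary to 8 KB per wrapper call regardless of image width, at the cost of re-entering the kernels once per 2048-pixel chunk.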
 
-#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
-                             uint8* rgb_buf,
+                             uint8* dst_argb1555,
                              int width) {
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
-  ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
-  free_aligned_buffer_64(row);
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb1555 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
 void I422ToARGB4444Row_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
-                             uint8* rgb_buf,
+                             uint8* dst_argb4444,
                              int width) {
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
-  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
-  free_aligned_buffer_64(row);
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb4444 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_rgb565,
-                           int width) {
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
-  free_aligned_buffer_64(row);
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+                           uint8* dst_rgb565, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_vu,
-                           uint8* dst_rgb565,
-                           int width) {
-  // Allocate a row of ARGB.
-  align_buffer_64(row, width * 4);
-  NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
-  free_aligned_buffer_64(row);
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+                           uint8* dst_rgb565, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         int width) {
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-  YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
-  YUY2ToYRow_SSE2(src_yuy2, row_y, width);
-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
-  free_aligned_buffer_64(row_y);
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+  // Row buffers for intermediate YUV pixels.
+  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
+    YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+    src_yuy2 += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
 }
+#endif
 
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
-                                   uint8* dst_argb,
-                                   int width) {
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
-  YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
-  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
-  free_aligned_buffer_64(row_y);
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+  // Row buffers for intermediate YUV pixels.
+  SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+  SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+  SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+    UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+    I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+    src_uyvy += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
 }
+#endif  // HAS_UYVYTOARGBROW_SSSE3
 
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         int width) {
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-  UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
-  UYVYToYRow_SSE2(src_uyvy, row_y, width);
-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
-  free_aligned_buffer_64(row_y);
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
-                                   uint8* dst_argb,
-                                   int width) {
-  // Allocate a rows of yuv.
-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-  uint8* row_u = row_y + ((width + 63) & ~63);
-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
-  UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
-  UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
-  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
-  free_aligned_buffer_64(row_y);
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb1555 += twidth * 2;
+    width -= twidth;
+  }
 }
+#endif
 
-#endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_argb4444 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORAWROW_AVX2)
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+    // TODO(fbarchard): ARGBToRAWRow_AVX2
+    ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_raw += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+                          uint8* dst_rgb565, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+                          uint8* dst_rgb565, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb565 += twidth * 2;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+  // Row buffers for intermediate YUV pixels.
+  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
+    YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
+    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+    src_yuy2 += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+  // Row buffers for intermediate YUV pixels.
+  SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+  SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+  SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
+    UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
+    I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+    src_uyvy += twidth * 2;
+    dst_argb += twidth * 4;
+    width -= twidth;
+  }
+}
 #endif  // !defined(LIBYUV_DISABLE_X86)
 
 void ARGBPolynomialRow_C(const uint8* src_argb,
diff --git a/libvpx/third_party/libyuv/source/row_posix.cc b/libvpx/third_party/libyuv/source/row_gcc.cc
similarity index 64%
rename from libvpx/third_party/libyuv/source/row_posix.cc
rename to libvpx/third_party/libyuv/source/row_gcc.cc
index 106fda5..820de0a 100644
--- a/libvpx/third_party/libyuv/source/row_posix.cc
+++ b/libvpx/third_party/libyuv/source/row_gcc.cc
@@ -1,3 +1,4 @@
+// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -92,6 +93,7 @@
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
+// 0.5 in 7-bit fixed point (64), added for rounding before the >> 7.
 static vec16 kAddYJ64 = {
   64, 64, 64, 64, 64, 64, 64, 64
 };
@@ -221,7 +223,7 @@
   "1:                                          \n"
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -229,47 +231,13 @@
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm5"
   );
 }
 #endif  // TESTING
 
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
-                                  int pix) {
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pslld     $0x18,%%xmm5                    \n"
@@ -291,14 +259,10 @@
   : "+r"(src_y),     // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
   );
 }
-#endif  // HAS_I400TOARGBROW_SSE2
+#endif  // HAS_J400TOARGBROW_SSE2
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
@@ -318,27 +282,24 @@
     "por       %%xmm5,%%xmm2                   \n"
     "palignr   $0xc,%%xmm0,%%xmm1              \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
     "por       %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "por       %%xmm5,%%xmm1                   \n"
     "palignr   $0x4,%%xmm3,%%xmm3              \n"
     "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "por       %%xmm5,%%xmm3                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
     "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -359,27 +320,24 @@
     "por       %%xmm5,%%xmm2                   \n"
     "palignr   $0xc,%%xmm0,%%xmm1              \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
     "por       %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "por       %%xmm5,%%xmm1                   \n"
     "palignr   $0x4,%%xmm3,%%xmm3              \n"
     "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "por       %%xmm5,%%xmm3                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
     "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -417,9 +375,8 @@
     "movdqa    %%xmm1,%%xmm2                   \n"
     "punpcklbw %%xmm0,%%xmm1                   \n"
     "punpckhbw %%xmm0,%%xmm2                   \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -427,13 +384,8 @@
     "+r"(dst),  // %1
     "+r"(pix)   // %2
   :
-  : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
@@ -474,9 +426,8 @@
     "movdqa    %%xmm1,%%xmm2                   \n"
     "punpcklbw %%xmm0,%%xmm1                   \n"
     "punpckhbw %%xmm0,%%xmm2                   \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -484,13 +435,8 @@
     "+r"(dst),  // %1
     "+r"(pix)   // %2
   :
-  : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
@@ -518,9 +464,8 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm2,%%xmm0                   \n"
     "punpckhbw %%xmm2,%%xmm1                   \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
-    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -528,13 +473,8 @@
     "+r"(dst),  // %1
     "+r"(pix)   // %2
   :
-  : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -572,10 +512,7 @@
     "+r"(dst),  // %1
     "+r"(pix)   // %2
   : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -613,10 +550,7 @@
     "+r"(dst),  // %1
     "+r"(pix)   // %2
   : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -631,7 +565,7 @@
     "pslld     $0xb,%%xmm5                     \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "pslld     $0x8,%%xmm0                     \n"
@@ -652,11 +586,7 @@
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -672,7 +602,7 @@
     "pslld     $0xf,%%xmm7                     \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm3                   \n"
@@ -690,17 +620,14 @@
     "packssdw  %%xmm0,%%xmm0                   \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMACCESS2(0x8,1) ",%1        \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
     "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  :: "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
@@ -712,7 +639,7 @@
     "psrlw     $0x8,%%xmm3                     \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "pand      %%xmm3,%%xmm0                   \n"
     "pand      %%xmm4,%%xmm1                   \n"
@@ -728,57 +655,17 @@
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(pix)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
   );
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
 #ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
     "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
@@ -796,68 +683,29 @@
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   : "m"(kARGBToY),   // %3
     "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 #ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but with different coefficients, no +16 bias, and rounding.
 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm4                       \n"
     "movdqa    %4,%%xmm5                       \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kARGBToYJ),  // %3
-    "m"(kAddYJ64)    // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -874,53 +722,131 @@
     "psrlw     $0x7,%%xmm0                     \n"
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   : "m"(kARGBToYJ),  // %3
     "m"(kAddYJ64)    // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_ARGBTOYJROW_SSSE3
 
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd constant to undo the lane interleave left by vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vbroadcastf128 %4,%%ymm5                  \n"
+    "vmovdqu    %5,%%ymm6                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOYROW_AVX2
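One note on the AVX2 Y kernels above: vphaddw and vpackuswb operate within each 128-bit lane, so the 32 Y bytes leave vpackuswb grouped in 4-byte units in the order 0, 2, 4, 6, 1, 3, 5, 7; the trailing vpermd with kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7} (dst[i] = src[index[i]]) puts them back into pixel order. A tiny sketch (not part of the patch) of that index math:

  #include <stdio.h>

  int main(void) {
    /* 4-byte groups as they leave the lane-wise vphaddw + vpackuswb. */
    int lane_interleaved[8] = {0, 2, 4, 6, 1, 3, 5, 7};
    int perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};  /* kPermdARGBToY_AVX */
    int out[8];
    int i;
    for (i = 0; i < 8; ++i) {
      out[i] = lane_interleaved[perm[i]];  /* what vpermd does per dword */
    }
    for (i = 0; i < 8; ++i) {
      printf("%d ", out[i]);  /* prints 0 1 2 3 4 5 6 7 */
    }
    printf("\n");
    return 0;
  }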
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vbroadcastf128 %4,%%ymm5                  \n"
+    "vmovdqu    %5,%%ymm6                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_AVX2
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -942,52 +868,113 @@
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToV),  // %5
+    "m"(kARGBToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
+#endif  // HAS_ARGBTOUVROW_SSSE3
 
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vbroadcastf128 %5,%%ymm5                  \n"
+    "vbroadcastf128 %6,%%ymm6                  \n"
+    "vbroadcastf128 %7,%%ymm7                  \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUV128),  // %5
+    "m"(kARGBToV),   // %6
+    "m"(kARGBToU),   // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                         uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToUJ),  // %0
-    "m"(kARGBToVJ),  // %1
-    "m"(kAddUVJ128)  // %2
-  );
-  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1010,245 +997,32 @@
     "psraw     $0x8,%%xmm0                     \n"
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToVJ),  // %5
+    "m"(kARGBToUJ),  // %6
+    "m"(kAddUVJ128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
+#endif  // HAS_ARGBTOUVJROW_SSSE3
 
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),         // %0
-    "m"(kARGBToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                  uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToUJ),         // %0
-    "m"(kARGBToVJ),         // %1
-    "m"(kAddUVJ128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb))
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                           int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
-  );
-}
-
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
-                                    uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
+    "movdqa    %4,%%xmm3                       \n"
+    "movdqa    %5,%%xmm4                       \n"
+    "movdqa    %6,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
@@ -1266,7 +1040,6 @@
     "psraw     $0x8,%%xmm2                     \n"
     "packsswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
@@ -1283,98 +1056,30 @@
     "packsswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
-    BUNDLEALIGN
     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb),        // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6"
   );
 }
+#endif  // HAS_ARGBTOUV444ROW_SSSE3
 
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
-                                    uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kARGBToU),  // %0
-    "m"(kARGBToV),  // %1
-    "m"(kAddUV128)  // %2
-  );
-  asm volatile (
+    "movdqa    %4,%%xmm3                       \n"
+    "movdqa    %5,%%xmm4                       \n"
+    "movdqa    %6,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
@@ -1403,26 +1108,23 @@
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
+#endif  // HAS_ARGBTOUV422ROW_SSSE3
 
 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
@@ -1430,43 +1132,6 @@
     "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1482,116 +1147,41 @@
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   : "m"(kBGRAToY),   // %3
     "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kBGRAToU),         // %0
-    "m"(kBGRAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kBGRAToU),         // %0
-    "m"(kBGRAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm2                   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1613,24 +1203,21 @@
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_bgra0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "r"((intptr_t)(src_stride_bgra)), // %4
+    "m"(kBGRAToV),  // %5
+    "m"(kBGRAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
 
@@ -1640,43 +1227,6 @@
     "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1692,19 +1242,16 @@
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_abgr),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   : "m"(kABGRToY),   // %3
     "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -1714,43 +1261,6 @@
     "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  : "m"(kRGBAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1766,116 +1276,41 @@
     "psrlw     $0x7,%%xmm2                     \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_rgba),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   : "m"(kRGBAToY),   // %3
     "m"(kAddY16)     // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kABGRToU),         // %0
-    "m"(kABGRToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kABGRToU),         // %0
-    "m"(kABGRToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm2                   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1897,121 +1332,46 @@
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_abgr0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "r"((intptr_t)(src_stride_abgr)), // %4
+    "m"(kABGRToV),  // %5
+    "m"(kABGRToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
 
 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                        uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kRGBAToU),         // %0
-    "m"(kRGBAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba))
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kRGBAToU),         // %0
-    "m"(kRGBAToV),         // %1
-    "m"(kAddUV128)         // %2
-  );
-  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm2                   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -2033,75 +1393,149 @@
     "psraw     $0x8,%%xmm1                     \n"
     "packsswb  %%xmm1,%%xmm0                   \n"
     "paddb     %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
     "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_rgba0),       // %0
     "+r"(dst_u),           // %1
     "+r"(dst_v),           // %2
     "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba)) // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+  : "r"((intptr_t)(src_stride_rgba)), // %4
+    "m"(kRGBAToV),  // %5
+    "m"(kRGBAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
   );
 }
-#endif  // HAS_ARGBTOUVROW_SSSE3
 
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
 
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-struct {
-  vec8 kUVToB;  // 0
-  vec8 kUVToG;  // 16
-  vec8 kUVToR;  // 32
-  vec16 kUVBiasB;  // 48
-  vec16 kUVBiasG;  // 64
-  vec16 kUVBiasR;  // 80
-  vec16 kYSub16;  // 96
-  vec16 kYToRgb;  // 112
-  vec8 kVUToB;  // 128
-  vec8 kVUToG;  // 144
-  vec8 kVUToR;  // 160
-} static SIMD_ALIGNED(kYuvConstants) = {
-  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR },
-  { 16, 16, 16, 16, 16, 16, 16, 16 },
-  { YG, YG, YG, YG, YG, YG, YG, YG },
-  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+struct YuvConstants {
+  lvec8 kUVToB;     // 0
+  lvec8 kUVToG;     // 32
+  lvec8 kUVToR;     // 64
+  lvec16 kUVBiasB;  // 96
+  lvec16 kUVBiasG;  // 128
+  lvec16 kUVBiasR;  // 160
+  lvec16 kYToRgb;   // 192
 };
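+// The byte offsets noted after each field (0, 32 ... 192) are the offsets
+// the YUVTORGB macro below reads via MEMACCESS/MEMACCESS2, so the field
+// order and lvec widths must stay in sync with that macro.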
 
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
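+
+// Illustrative sanity check of the constants (derived from YUVTORGB below):
+// each channel is computed as, e.g. for blue,
+//   B = ((Y * 257 * YG) >> 16) + BB - U * UB, then >> 6 and saturated.
+// Black (Y=16, U=128):  1191 - 17544 + 16384 = 31,    31 >> 6 = 0.
+// White (Y=235, U=128): 17506 - 17544 + 16384 = 16346, 16346 >> 6 = 255.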
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414  * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128             + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ             (VRJ * 128 + YGBJ)
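+
+// Sanity check of the full-range JPEG constants (same arithmetic as YUVTORGB):
+// Black (Y=0, U=128):       0 - 14432 + 14464 = 32,    32 >> 6 = 0.
+// White (Y=255, U=128): 16319 - 14432 + 14464 = 16351, 16351 >> 6 = 255.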
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
 
 // Read 8 UV from 444
 #define READYUV444                                                             \
     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    BUNDLEALIGN                                                                \
     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"
@@ -2109,7 +1543,6 @@
 // Read 4 UV from 422, upsample to 8 UV
 #define READYUV422                                                             \
     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    BUNDLEALIGN                                                                \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
@@ -2118,7 +1551,6 @@
 // Read 2 UV from 411, upsample to 8 UV
 #define READYUV411                                                             \
     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    BUNDLEALIGN                                                                \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
@@ -2132,20 +1564,23 @@
     "punpcklwd  %%xmm0,%%xmm0                                   \n"
 
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB                                                               \
+#define YUVTORGB(YuvConstants)                                                 \
     "movdqa     %%xmm0,%%xmm1                                   \n"            \
     "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
-    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
-    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
-    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
-    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
-    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
+    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
+    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
     "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
+    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
+    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
     "paddsw     %%xmm3,%%xmm0                                   \n"            \
     "paddsw     %%xmm3,%%xmm1                                   \n"            \
     "paddsw     %%xmm3,%%xmm2                                   \n"            \
@@ -2156,30 +1591,51 @@
     "packuswb   %%xmm1,%%xmm1                                   \n"            \
     "packuswb   %%xmm2,%%xmm2                                   \n"
 
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB                                                               \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
-    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
-    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
-    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
-    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
-    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
-    "paddsw     %%xmm3,%%xmm0                                   \n"            \
-    "paddsw     %%xmm3,%%xmm1                                   \n"            \
-    "paddsw     %%xmm3,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+// Store 8 ARGB values. Assumes XMM5 holds 0xff alpha bytes (set by caller).
+#define STOREARGB                                                              \
+    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
+    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
+    "movdqa     %%xmm0,%%xmm1                                    \n"           \
+    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
+    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
+    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
+    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
+    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+
+// Store 8 BGRA values. Sets XMM5 to the 0xff alpha bytes.
+#define STOREBGRA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"
+
+// Store 8 ABGR values. Assumes XMM5 holds 0xff alpha bytes (set by caller).
+#define STOREABGR                                                              \
+    "punpcklbw %%xmm1,%%xmm2                                     \n"           \
+    "punpcklbw %%xmm5,%%xmm0                                     \n"           \
+    "movdqa    %%xmm2,%%xmm1                                     \n"           \
+    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
+    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
+    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
+    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"
+
+// Store 8 RGBA values. Sets XMM5 to the 0xff alpha bytes.
+#define STORERGBA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
 
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
@@ -2189,19 +1645,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV444
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
+    YUVTORGB(kYuvConstants)
+    STOREARGB
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2210,41 +1658,25 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
+// TODO(fbarchard): Consider putting masks into constants.
 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* dst_rgb24,
                                  int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
   asm volatile (
     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
-  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
-    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
-#endif
-
-  asm volatile (
-#if !defined(__i386__)
-    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
-    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
-#endif
     "sub       %[u_buf],%[v_buf]               \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm2,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -2256,25 +1688,23 @@
     "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
     "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
-    "sub       $0x8,%[width]                   \n"
+    "subl      $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
-    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+#endif
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-#endif
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
   );
 }
 
@@ -2283,26 +1713,14 @@
                                const uint8* v_buf,
                                uint8* dst_raw,
                                int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
   asm volatile (
     "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
     "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
-  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
-    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
-#endif
-
-  asm volatile (
-#if !defined(__i386__)
-    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
-    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
-#endif
     "sub       %[u_buf],%[v_buf]               \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm2,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -2314,25 +1732,23 @@
     "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
     "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
-    "sub       $0x8,%[width]                   \n"
+    "subl      $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_raw]"+r"(dst_raw),  // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)    // %[width]
+#else
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
-    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+#endif
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
-#endif
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
   );
 }
 
@@ -2344,19 +1760,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    YUVTORGB(kYuvConstants)
+    STOREARGB
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2365,13 +1773,34 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2383,19 +1812,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV411
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    YUVTORGB(kYuvConstants)
+    STOREARGB
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2404,13 +1825,8 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2420,19 +1836,11 @@
                                 int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READNV12
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    YUVTORGB(kYuvConstants)
+    STOREARGB
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2440,11 +1848,8 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
   // Does not use r14.
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2454,216 +1859,20 @@
                                 int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READNV12
-    YVUTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    YUVTORGB(kYuvConstants)
+    STOREARGB
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
+  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
   // Does not use r14.
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_argb,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_argb,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_argb,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* uv_buf,
-                                          uint8* dst_argb,
-                                          int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-  // Does not use r14.
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* uv_buf,
-                                          uint8* dst_argb,
-                                          int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READNV12
-    YVUTORGB
-    "punpcklbw %%xmm1,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm0                   \n"
-    "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-  // Does not use r14.
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2675,20 +1884,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
-    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
-    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+    YUVTORGB(kYuvConstants)
+    STOREBGRA
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2697,13 +1897,8 @@
     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2715,19 +1910,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm2                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
-    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+    YUVTORGB(kYuvConstants)
+    STOREABGR
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2736,13 +1923,8 @@
     [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -2754,20 +1936,11 @@
   asm volatile (
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
     LABELALIGN
   "1:                                          \n"
     READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm2,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
-    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
-    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+    YUVTORGB(kYuvConstants)
+    STORERGBA
     "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -2776,159 +1949,275 @@
     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
     [width]"+rm"(width)    // %[width]
   : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_bgra,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
-    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_abgr,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm2                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
-    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-
-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* u_buf,
-                                          const uint8* v_buf,
-                                          uint8* dst_rgba,
-                                          int width) {
-  asm volatile (
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm2,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "movdqa    %%xmm5,%%xmm0                   \n"
-    "punpcklwd %%xmm1,%%xmm5                   \n"
-    "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
-    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
-    "sub       $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
-    [width]"+rm"(width)    // %[width]
-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
 #endif  // HAS_I422TOARGBROW_SSSE3
 
-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* dst_argb,
-                     int width) {
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2                                                        \
+    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants)                                            \
+    "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
+    "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
+    "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
+    "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
+    "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
+    "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \
+    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
+    "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
+    "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
+    "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
+    "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
+    "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
+    "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
+    "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
+    "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
+    "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
+    "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
+    "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
+
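The two macros above carry the actual conversion: READYUV422_AVX2 loads 8 U and 8 V bytes and doubles them to 16 interleaved UV values, and YUVTORGB_AVX2 does the fixed-point convert -- vpmaddubsw of the UV bytes against per-channel coefficients, subtraction from a per-channel bias, addition of the Y term produced by vpmulhuw, then an arithmetic shift right by 6 and unsigned saturation. A rough scalar sketch of that per-pixel arithmetic follows; coeff_u, coeff_v and bias are illustrative stand-ins for the packed kYuvConstants entries, not the real layout.

#include <stdint.h>

static uint8_t clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One output channel per pixel, mirroring the "bias - UV dot product +
 * scaled Y, >> 6, saturate" structure of YUVTORGB_AVX2. coeff_u, coeff_v
 * and bias are hypothetical stand-ins for the kYuvConstants values. */
static uint8_t yuv_channel(int y, int u, int v,
                           int coeff_u, int coeff_v, int bias) {
  int y_scaled = (int)(y * 1.164 * 64);   /* the vpmulhuw step */
  int uv = u * coeff_u + v * coeff_v;     /* the vpmaddubsw step */
  return clamp255((bias - uv + y_scaled) >> 6);
}

The weave step that follows in each function then interleaves the three channel vectors with the all-0xFF alpha register (ymm5) into 32-bit pixels.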
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_bgra,
+                               int width) {
   asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "mov       $0x00100010,%%eax               \n"
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "mov       $0x004a004a,%%eax               \n"
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into BGRA
+    "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
+
+    "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
+    "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+    "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_I422TOBGRAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
+
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_J422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
+
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_J422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOABGRROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ABGR
+    "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_I422TOABGRROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into RGBA
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
     "movd      %%eax,%%xmm2                    \n"
     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
+    "movd      %%eax,%%xmm3                    \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
     LABELALIGN
   "1:                                          \n"
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
     "psubusw   %%xmm3,%%xmm0                   \n"
-    "pmullw    %%xmm2,%%xmm0                   \n"
     "psrlw     $6, %%xmm0                      \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
 
@@ -2939,8 +2228,8 @@
     "punpckhwd %%xmm1,%%xmm1                   \n"
     "por       %%xmm4,%%xmm0                   \n"
     "por       %%xmm4,%%xmm1                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
 
     "sub       $0x8,%2                         \n"
@@ -2950,12 +2239,57 @@
     "+rm"(width)     // %2
   :
   : "memory", "cc", "eax"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
   );
 }
-#endif  // HAS_YTOARGBROW_SSE2
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
+    "vmovd      %%eax,%%xmm2                   \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+    "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
+    "vmovd      %%eax,%%xmm3                   \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpslld     $0x18,%%ymm4,%%ymm4            \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
+    "lea        " MEMLEA(0x10,0) ",%0          \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub        $0x10,%2                       \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_AVX2
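Both I400ToARGBRow kernels above implement the formula noted in the SSE2 comments, G = (y - 16) * 1.164, replicate the result into B, G and R, and force alpha to 0xFF with the pslld/vpslld $0x18 mask. A plain-C sketch of one row (illustrative only, not the library's C fallback):

#include <stdint.h>

static void I400ToARGBRow_sketch(const uint8_t* y_buf, uint8_t* dst_argb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    int g = (int)((y_buf[x] - 16) * 1.164);  /* scale studio-range Y */
    if (g < 0) g = 0;
    if (g > 255) g = 255;
    dst_argb[4 * x + 0] = (uint8_t)g;  /* B */
    dst_argb[4 * x + 1] = (uint8_t)g;  /* G */
    dst_argb[4 * x + 2] = (uint8_t)g;  /* R */
    dst_argb[4 * x + 3] = 0xFF;        /* A */
  }
}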
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
@@ -2967,38 +2301,56 @@
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
     LABELALIGN
   "1:                                          \n"
-    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
     "pshufb    %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
   : "m"(kShuffleMirror) // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
   );
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm5                  \n"
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_AVX2
+
 #ifdef HAS_MIRRORROW_SSE2
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
     LABELALIGN
   "1:                                          \n"
-    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
     "movdqa    %%xmm0,%%xmm1                   \n"
     "psllw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
@@ -3006,21 +2358,16 @@
     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
-    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1)",%1            \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
   );
 }
 #endif  // HAS_MIRRORROW_SSE2
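The MirrorRow kernels above (SSSE3, AVX2, SSE2) all produce a byte-reversed copy of the row; the rewrite reads from the end of the source with a negative offset instead of pre-decrementing the pointer, and moves the width decrement next to the branch. Scalar equivalent for reference:

#include <stdint.h>

static void MirrorRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  /* reverse byte order across the row */
  }
}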
@@ -3035,108 +2382,119 @@
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "movdqa    %4,%%xmm1                       \n"
-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
+    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(-0x10,0) ",%0            \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
     "pshufb    %%xmm1,%%xmm0                   \n"
-    "sub       $8,%3                           \n"
     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $8,%3                           \n"
     "jg        1b                              \n"
   : "+r"(src),      // %0
     "+r"(dst_u),    // %1
     "+r"(dst_v),    // %2
     "+r"(temp_width)  // %3
   : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
   );
 }
 #endif  // HAS_MIRRORROW_UV_SSSE3
 
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
+#ifdef HAS_ARGBMIRRORROW_SSE2
 
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
-    "movdqa    %3,%%xmm5                       \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
     "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror)  // %3
+  :
   : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm5"
-#endif
+    , "xmm0"
   );
 }
-#endif  // HAS_ARGBMIRRORROW_SSSE3
+#endif  // HAS_ARGBMIRRORROW_SSE2
 
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table for reversing whole 32-bit ARGB pixels (vpermd indices).
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
   asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
+    "vmovdqu    %3,%%ymm5                      \n"
+    LABELALIGN
+  "1:                                          \n"
+    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x8,%2                        \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kARGBShuffleMirror_AVX2) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
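ARGBMirrorRow reverses whole 4-byte pixels rather than individual bytes: pshufd $0x1b flips four pixels per XMM register in the SSE2 path, and the vpermd table above flips eight per YMM register in the AVX2 path. Scalar equivalent:

#include <stdint.h>
#include <string.h>

static void ARGBMirrorRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    /* copy pixel (width - 1 - x) into position x, four bytes at a time */
    memcpy(dst + 4 * x, src + 4 * (width - 1 - x), 4);
  }
}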
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
     "sub        %1,%2                            \n"
     LABELALIGN
   "1:                                            \n"
-    "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
+    "lea        " MEMLEA(0x40,0) ",%0            \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
+    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
+    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
+    "lea        " MEMLEA(0x20,1) ",%1            \n"
+    "sub        $0x20,%3                         \n"
     "jg         1b                               \n"
+    "vzeroupper                                  \n"
   : "+r"(src_uv),     // %0
     "+r"(dst_u),      // %1
     "+r"(dst_v),      // %2
     "+r"(pix)         // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
+#endif  // HAS_SPLITUVROW_AVX2
 
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix) {
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "pcmpeqb    %%xmm5,%%xmm5                    \n"
     "psrlw      $0x8,%%xmm5                      \n"
@@ -3164,52 +2522,46 @@
     "+r"(dst_v),      // %2
     "+r"(pix)         // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 #endif  // HAS_SPLITUVROW_SSE2
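SplitUVRow deinterleaves a packed UV row (NV12-style) into separate U and V planes: the even bytes are kept with a 0x00FF mask and packed down for U, the odd bytes are shifted right by 8 and packed for V. The scalar equivalent is simply:

#include <stdint.h>

static void SplitUVRow_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  /* even bytes -> U plane */
    dst_v[x] = src_uv[2 * x + 1];  /* odd bytes  -> V plane */
  }
}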
 
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   asm volatile (
     "sub       %0,%1                             \n"
     LABELALIGN
   "1:                                            \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
+    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
+    "lea       " MEMLEA(0x20,0) ",%0             \n"
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
+    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
+    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+    "lea       " MEMLEA(0x40,2) ",%2             \n"
+    "sub       $0x20,%3                          \n"
     "jg        1b                                \n"
+    "vzeroupper                                  \n"
   : "+r"(src_u),     // %0
     "+r"(src_v),     // %1
     "+r"(dst_uv),    // %2
     "+r"(width)      // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
   );
 }
+#endif  // HAS_MERGEUVROW_AVX2
 
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width) {
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
   asm volatile (
     "sub       %0,%1                             \n"
     LABELALIGN
@@ -3230,13 +2582,8 @@
     "+r"(dst_uv),    // %2
     "+r"(width)      // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
   );
 }
 #endif  // HAS_MERGEUVROW_SSE2
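MergeUVRow is the inverse operation: it zips separate U and V rows back into packed UV with punpcklbw/punpckhbw (the AVX2 version needs the four vextractf128 stores to undo the per-lane interleave of the 256-bit registers). Scalar form:

#include <stdint.h>

static void MergeUVRow_sketch(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}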
@@ -3246,11 +2593,11 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
     "sub       $0x20,%2                        \n"
     "jg        1b                              \n"
@@ -3259,30 +2606,36 @@
     "+r"(count)  // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1"
-#endif
   );
 }
 #endif  // HAS_COPYROW_SSE2
 
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
-  size_t width_tmp = (size_t)(width);
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   asm volatile (
-    "shr       $0x2,%2                         \n"
-    "rep movsl " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x40,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
   :
   : "memory", "cc"
+    , "xmm0", "xmm1"
   );
 }
-#endif  // HAS_COPYROW_X86
+#endif  // HAS_COPYROW_AVX
 
 #ifdef HAS_COPYROW_ERMS
-// Unaligned Multiple of 1.
+// Multiple of 1.
 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   size_t width_tmp = (size_t)(width);
   asm volatile (
@@ -3306,19 +2659,19 @@
     "psrld     $0x8,%%xmm1                     \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
     "pand      %%xmm0,%%xmm2                   \n"
     "pand      %%xmm0,%%xmm3                   \n"
     "pand      %%xmm1,%%xmm4                   \n"
     "pand      %%xmm1,%%xmm5                   \n"
     "por       %%xmm4,%%xmm2                   \n"
     "por       %%xmm5,%%xmm3                   \n"
-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -3327,9 +2680,7 @@
     "+r"(width)  // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
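The copy-alpha kernels replace only the alpha byte of each destination pixel, which is why both source and destination are masked with complementary pand masks and recombined with por. The scalar intent (ARGBCopyAlphaRow_Sketch below is a hypothetical illustration, not part of the patch) is simply:

#include <stdint.h>

// Sketch of ARGBCopyAlphaRow: take A from src, keep B/G/R from dst.
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    dst[4 * i + 3] = src[4 * i + 3];
  }
}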
@@ -3358,9 +2709,7 @@
     "+r"(width)  // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2"
-#endif
   );
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
@@ -3380,16 +2729,16 @@
     "punpcklbw %%xmm2,%%xmm2                   \n"
     "punpckhwd %%xmm2,%%xmm3                   \n"
     "punpcklwd %%xmm2,%%xmm2                   \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
     "pand      %%xmm0,%%xmm2                   \n"
     "pand      %%xmm0,%%xmm3                   \n"
     "pand      %%xmm1,%%xmm4                   \n"
     "pand      %%xmm1,%%xmm5                   \n"
     "por       %%xmm4,%%xmm2                   \n"
     "por       %%xmm5,%%xmm3                   \n"
-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
     "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
@@ -3398,9 +2747,7 @@
     "+r"(width)  // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
@@ -3431,18 +2778,16 @@
     "+r"(width)  // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2"
-#endif
   );
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint32 v32, int width) {
-  size_t width_tmp = (size_t)(width);
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+  size_t width_tmp = (size_t)(width >> 2);
+  const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.
   asm volatile (
-    "shr       $0x2,%1                         \n"
     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
     : "+D"(dst),       // %0
       "+c"(width_tmp)  // %1
@@ -3450,19 +2795,24 @@
     : "memory", "cc");
 }
 
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
-                   int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    size_t width_tmp = (size_t)(width);
-    uint32* d = (uint32*)(dst);
-    asm volatile (
-      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
-      : "+D"(d),         // %0
-        "+c"(width_tmp)  // %1
-      : "a"(v32)         // %2
-      : "memory", "cc");
-    dst += dst_stride;
-  }
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep stosb " MEMSTORESTRING(al,0) "        \n"
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v8)          // %2
+    : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
+    : "+D"(dst_argb),  // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
 }
 #endif  // HAS_SETROW_X86
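The new SetRow_X86 signature takes a single byte and widens it before the rep stosl; the widening is just the byte replicated into all four byte lanes of a 32-bit word. A hedged scalar sketch of that path (SetRow_Sketch is illustrative only; width is assumed to be a multiple of 4, as the width >> 2 in the real function implies):

#include <stdint.h>
#include <string.h>

// Sketch of SetRow_X86: duplicate the byte into a 32-bit word and store
// width/4 words; one memcpy stands in for one "rep stosl" iteration.
static void SetRow_Sketch(uint8_t* dst, uint8_t v8, int width) {
  const uint32_t v32 = v8 * 0x01010101u;  // v8 in all four byte lanes
  for (int x = 0; x < width; x += 4) {
    memcpy(dst + x, &v32, 4);
  }
}

// SetRow_ERMS is byte-granular ("rep stosb"); memset is its scalar equivalent.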
 
@@ -3473,13 +2823,13 @@
     "psrlw     $0x8,%%xmm5                     \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pand      %%xmm5,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
@@ -3488,9 +2838,7 @@
     "+r"(pix)        // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm5"
-#endif
   );
 }
 
@@ -3502,11 +2850,10 @@
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"
@@ -3519,7 +2866,6 @@
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm1                   \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
     "sub       $0x10,%3                        \n"
@@ -3529,13 +2875,8 @@
     "+r"(dst_v),       // %2
     "+r"(pix)          // %3
   : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
@@ -3547,120 +2888,6 @@
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
-                                int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3673,7 +2900,6 @@
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm1                   \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
     "sub       $0x10,%3                        \n"
@@ -3683,13 +2909,8 @@
     "+r"(dst_v),       // %2
     "+r"(pix)          // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
   );
 }
 
@@ -3697,24 +2918,22 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "psrlw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1"
-#endif
   );
 }
 
@@ -3726,120 +2945,8 @@
     "sub       %1,%2                           \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(pix)          // %3
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    BUNDLEALIGN
     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3854,7 +2961,6 @@
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm1                   \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
     "sub       $0x10,%3                        \n"
@@ -3864,18 +2970,13 @@
     "+r"(dst_v),       // %2
     "+r"(pix)          // %3
   : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
@@ -3894,7 +2995,6 @@
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm1                   \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
     "lea       " MEMLEA(0x8,1) ",%1            \n"
     "sub       $0x10,%3                        \n"
@@ -3904,17 +3004,218 @@
     "+r"(dst_v),       // %2
     "+r"(pix)          // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
   );
 }
 #endif  // HAS_YUY2TOYROW_SSE2
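These YUY2/UYVY kernels all operate on packed 4:2:2 byte order (YUY2 = Y0 U Y1 V, UYVY = U Y0 V Y1). A rough scalar sketch of the Y extraction and the two-row chroma averaging (the *_Sketch helpers are illustrative only; pavgb's rounding is the "+ 1 >> 1" below):

#include <stdint.h>

// Sketch: YUY2 keeps Y in even bytes; UYVY would read src[2 * x + 1].
static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                              int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[2 * x];
  }
}

// Sketch: chroma from two adjacent rows is averaged with rounding,
// producing one U and one V sample per 2x2 block of pixels.
static void YUY2ToUVRow_Sketch(const uint8_t* src_yuy2, int stride_yuy2,
                               uint8_t* dst_u, uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    const uint8_t* row0 = src_yuy2 + x * 2;
    const uint8_t* row1 = row0 + stride_yuy2;
    dst_u[x / 2] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);
    dst_v[x / 2] = (uint8_t)((row0[3] + row1[3] + 1) >> 1);
  }
}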
 
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+    "sub       %1,%2                           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_AVX2
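In the AVX2 variants, vpackuswb packs within each 128-bit lane, so packed bytes come out lane-interleaved; the vpermq $0xd8 that follows each pack restores linear order before the store (or the vextractf128 split). A small intrinsics sketch of that idiom (PackU16ToU8Linear is a hypothetical helper, not part of the patch):

#include <immintrin.h>

// _mm256_packus_epi16 produces qwords in the order a_lo, b_lo, a_hi, b_hi;
// permuting qwords with 0xd8 (select 0,2,1,3) yields a_lo, a_hi, b_lo, b_hi,
// i.e. all bytes of 'a' followed by all bytes of 'b'.
static inline __m256i PackU16ToU8Linear(__m256i a, __m256i b) {
  __m256i packed = _mm256_packus_epi16(a, b);
  return _mm256_permute4x64_epi64(packed, 0xd8);
}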
+
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -3928,41 +3229,7 @@
     "psllw     $0x8,%%xmm5                     \n"
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
     "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x1,%3                         \n"
-    "je        91f                             \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop until destination pointer is aligned.
-  "10:                                         \n"
-    "test      $0xf,%2                         \n"
-    "je        19f                             \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-    "add       $1-4,%3                         \n"
+    "sub       $0x4,%3                         \n"
     "jl        49f                             \n"
 
     // 4 pixel loop.
@@ -3988,9 +3255,9 @@
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jge       41b                             \n"
 
   "49:                                         \n"
@@ -4019,9 +3286,9 @@
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
     "movd      %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
     "jge       91b                             \n"
   "99:                                         \n"
   : "+r"(src_argb0),    // %0
@@ -4030,9 +3297,7 @@
     "+r"(width)         // %3
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
   );
 }
 #endif  // HAS_ARGBBLENDROW_SSE2
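The blend kernels compute a standard "src over dst" composite driven by the foreground alpha, with the output alpha forced opaque by the 0xFF000000 mask built at the top. Roughly, per channel (ARGBBlendRow_Sketch is an illustrative scalar approximation; the SIMD code uses a (256 - a) >> 8 multiply and saturating adds rather than an exact divide by 255):

#include <stdint.h>

// Sketch of ARGBBlendRow: fg over bg using fg alpha, output alpha = 255.
static void ARGBBlendRow_Sketch(const uint8_t* fg, const uint8_t* bg,
                                uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    const int a = fg[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      int v = fg[4 * i + c] + ((bg[4 * i + c] * (256 - a)) >> 8);
      dst[4 * i + c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates
    }
    dst[4 * i + 3] = 255;
  }
}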
@@ -4065,75 +3330,12 @@
     "psllw     $0x8,%%xmm5                     \n"
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
     "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x1,%3                         \n"
-    "je        91f                             \n"
-    "jl        99f                             \n"
-
-    // 1 pixel loop until destination pointer is aligned.
-  "10:                                         \n"
-    "test      $0xf,%2                         \n"
-    "je        19f                             \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "jge       10b                             \n"
-
-  "19:                                         \n"
-    "add       $1-4,%3                         \n"
+    "sub       $0x4,%3                         \n"
     "jl        49f                             \n"
-    "test      $0xf,%0                         \n"
-    "jne       41f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       41f                             \n"
 
     // 4 pixel loop.
     LABELALIGN
   "40:                                         \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "jge       40b                             \n"
-    "jmp       49f                             \n"
-
-    // 4 pixel unaligned loop.
-    LABELALIGN
-  "41:                                         \n"
     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
@@ -4152,10 +3354,10 @@
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "jge       41b                             \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
 
   "49:                                         \n"
     "add       $0x3,%3                         \n"
@@ -4181,9 +3383,9 @@
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x1,%3                         \n"
     "movd      %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
     "jge       91b                             \n"
   "99:                                         \n"
   : "+r"(src_argb0),    // %0
@@ -4192,16 +3394,13 @@
     "+r"(width)         // %3
   : "m"(kShuffleAlpha)  // %4
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
   );
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
@@ -4212,17 +3411,17 @@
     // 4 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
     "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
     "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
     "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
     "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "psrlw     $0x8,%%xmm0                     \n"
     "pand      %%xmm4,%%xmm2                   \n"
@@ -4230,18 +3429,16 @@
     "packuswb  %%xmm1,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm0                   \n"
     "por       %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
     "+r"(dst_argb),    // %1
     "+r"(width)        // %2
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBATTENUATEROW_SSE2
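Attenuation premultiplies the color channels by the pixel's own alpha; the SSE2/SSSE3/AVX2 kernels approximate c * a / 255 with byte-duplicated operands and pmulhuw. A scalar sketch of the intent (ARGBAttenuateRow_Sketch is illustrative only):

#include <stdint.h>

// Sketch of ARGBAttenuateRow: premultiply B, G, R by alpha; alpha unchanged.
// The SIMD versions use a fixed-point approximation of the / 255.
static void ARGBAttenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    const uint32_t a = src[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      dst[4 * i + c] = (uint8_t)((src[4 * i + c] * a) / 255);
    }
    dst[4 * i + 3] = (uint8_t)a;
  }
}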
@@ -4249,14 +3446,13 @@
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha
 static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
 };
 static uvec8 kShuffleAlpha1 = {
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
 };
 // Attenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm3,%%xmm3                   \n"
@@ -4284,9 +3480,9 @@
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "por       %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
     "+r"(dst_argb),    // %1
@@ -4294,16 +3490,56 @@
   : "m"(kShuffleAlpha0),  // %3
     "m"(kShuffleAlpha1)  // %4
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
+    "sub        %0,%1                          \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
+    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha_AVX2)  // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   uintptr_t alpha = 0;
@@ -4324,7 +3560,6 @@
     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
@@ -4334,26 +3569,90 @@
     "pmulhuw   %%xmm2,%%xmm1                   \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
     "+r"(dst_argb),    // %1
     "+r"(width),       // %2
     "+r"(alpha)        // %3
   : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
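Unattenuation is the inverse: each color channel is rescaled by roughly 255 / alpha, looked up from the fixed_invtbl8 reciprocal table rather than divided per pixel (the AVX2 version that follows gathers eight table entries with scalar movzb + vmovd loads in place of a VPGATHER). A hedged scalar sketch of the intended math only (ARGBUnattenuateRow_Sketch is illustrative; the real kernels use the fixed-point table, which passes zero-alpha pixels through unchanged):

#include <stdint.h>

// Sketch of ARGBUnattenuateRow: divide B, G, R back out by alpha, clamped.
static void ARGBUnattenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    const uint32_t a = src[4 * i + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? (src[4 * i + c] * 255u) / a : src[4 * i + c];
      dst[4 * i + c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst[4 * i + 3] = (uint8_t)a;
  }
}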
 
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  uintptr_t alpha = 0;
+  asm volatile (
+    "sub        %0,%1                          \n"
+    "vbroadcastf128 %5,%%ymm5                  \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // replace VPGATHER
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+    // end of VPGATHER
+
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(alpha)        // %3
+  : "r"(fixed_invtbl8),  // %4
+    "m"(kUnattenShuffleAlpha_AVX2)  // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4364,16 +3663,16 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "pmaddubsw %%xmm4,%%xmm0                   \n"
     "pmaddubsw %%xmm4,%%xmm1                   \n"
     "phaddw    %%xmm1,%%xmm0                   \n"
     "paddw     %%xmm5,%%xmm0                   \n"
     "psrlw     $0x7,%%xmm0                     \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "psrld     $0x18,%%xmm2                    \n"
     "psrld     $0x18,%%xmm3                    \n"
@@ -4385,10 +3684,10 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm3,%%xmm0                   \n"
     "punpckhwd %%xmm3,%%xmm1                   \n"
-    "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
@@ -4396,9 +3695,7 @@
   : "m"(kARGBToYJ),   // %3
     "m"(kAddYJ64)     // %4
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBGRAYROW_SSSE3
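The gray conversion reuses the full-range (JPEG) luma coefficients from kARGBToYJ plus the kAddYJ64 rounding constant, then writes the luma back into B, G and R while keeping alpha. A sketch of that per-pixel math (ARGBGrayRow_Sketch is illustrative; the 7-bit fixed-point weights below are assumed to mirror kARGBToYJ):

#include <stdint.h>

// Sketch of ARGBGrayRow: full-range luma replicated to B/G/R, alpha kept.
static void ARGBGrayRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    const uint8_t b = src[4 * i + 0];
    const uint8_t g = src[4 * i + 1];
    const uint8_t r = src[4 * i + 2];
    const uint8_t y = (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
    dst[4 * i + 0] = y;
    dst[4 * i + 1] = y;
    dst[4 * i + 2] = y;
    dst[4 * i + 3] = src[4 * i + 3];
  }
}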
@@ -4430,30 +3727,30 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
     "pmaddubsw %%xmm2,%%xmm6                   \n"
     "phaddw    %%xmm6,%%xmm0                   \n"
     "psrlw     $0x7,%%xmm0                     \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "pmaddubsw %%xmm3,%%xmm5                   \n"
     "pmaddubsw %%xmm3,%%xmm1                   \n"
     "phaddw    %%xmm1,%%xmm5                   \n"
     "psrlw     $0x7,%%xmm5                     \n"
     "packuswb  %%xmm5,%%xmm5                   \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "pmaddubsw %%xmm4,%%xmm5                   \n"
     "pmaddubsw %%xmm4,%%xmm1                   \n"
     "phaddw    %%xmm1,%%xmm5                   \n"
     "psrlw     $0x7,%%xmm5                     \n"
     "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "psrld     $0x18,%%xmm6                    \n"
     "psrld     $0x18,%%xmm1                    \n"
     "packuswb  %%xmm1,%%xmm6                   \n"
@@ -4462,10 +3759,10 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm5,%%xmm0                   \n"
     "punpckhwd %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%1                         \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%1                         \n"
     "jg        1b                              \n"
   : "+r"(dst_argb),      // %0
     "+r"(width)          // %1
@@ -4473,9 +3770,7 @@
     "m"(kARGBToSepiaG),  // %3
     "m"(kARGBToSepiaR)   // %4
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
   );
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
@@ -4495,12 +3790,12 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
     "pmaddubsw %%xmm2,%%xmm7                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "pmaddubsw %%xmm3,%%xmm6                   \n"
     "pmaddubsw %%xmm3,%%xmm1                   \n"
     "phaddsw   %%xmm7,%%xmm0                   \n"
@@ -4510,13 +3805,13 @@
     "packuswb  %%xmm0,%%xmm0                   \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
     "punpcklbw %%xmm6,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
     "pmaddubsw %%xmm4,%%xmm1                   \n"
     "pmaddubsw %%xmm4,%%xmm7                   \n"
     "phaddsw   %%xmm7,%%xmm1                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
     "pmaddubsw %%xmm5,%%xmm6                   \n"
     "pmaddubsw %%xmm5,%%xmm7                   \n"
     "phaddsw   %%xmm7,%%xmm6                   \n"
@@ -4528,27 +3823,24 @@
     "movdqa    %%xmm0,%%xmm6                   \n"
     "punpcklwd %%xmm1,%%xmm0                   \n"
     "punpckhwd %%xmm1,%%xmm6                   \n"
-    "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),      // %0
     "+r"(dst_argb),      // %1
     "+r"(width)          // %2
   : "r"(matrix_argb)     // %3
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
   );
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   asm volatile (
@@ -4568,23 +3860,23 @@
     // 4 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
     "punpckhbw %%xmm5,%%xmm1                   \n"
     "pmulhuw   %%xmm2,%%xmm1                   \n"
     "pmullw    %%xmm3,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
     "pmullw    %%xmm3,%%xmm1                   \n"
     "pand      %%xmm6,%%xmm7                   \n"
     "paddw     %%xmm4,%%xmm0                   \n"
     "paddw     %%xmm4,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "por       %%xmm7,%%xmm0                   \n"
-    "sub       $0x4,%1                         \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x4,%1                         \n"
     "jg        1b                              \n"
   : "+r"(dst_argb),       // %0
     "+r"(width)           // %1
@@ -4592,16 +3884,13 @@
     "r"(interval_size),   // %3
     "r"(interval_offset)  // %4
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
   );
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
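The quantize kernel posterizes each color channel in place: scale the value down to an interval index, multiply back up by the interval size, add the offset; alpha is carried through by the mask built at the top. A scalar sketch of that per-channel math (ARGBQuantizeRow_Sketch is illustrative; scale is a 16.16 fixed-point factor supplied by the caller, typically derived from interval_size):

#include <stdint.h>

// Sketch of ARGBQuantizeRow: posterize B, G, R in place, keep alpha.
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      const int v = dst_argb[4 * i + c];
      dst_argb[4 * i + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
    // dst_argb[4 * i + 3] (alpha) is left untouched.
  }
}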
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
@@ -4612,7 +3901,7 @@
     // 4 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
@@ -4622,18 +3911,16 @@
     "psrlw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(width)      // %2
   : "r"(value)       // %3
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2"
-#endif
   );
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
@@ -4643,7 +3930,7 @@
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm5,%%xmm5                  \n"
 
     // 4 pixel loop.
     LABELALIGN
@@ -4661,9 +3948,9 @@
     "pmulhuw   %%xmm2,%%xmm0                   \n"
     "pmulhuw   %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4671,12 +3958,49 @@
     "+r"(width)       // %3
   :
   : "memory", "cc"
-#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__AVX2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
 #endif
   );
 }
-#endif  // HAS_ARGBMULTIPLYROW_SSE2
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
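
Both multiply kernels widen each byte of the first source to (v << 8) | v and keep the high 16 bits of its product with the zero-extended second source, which approximates per-channel a * b / 255; a minimal scalar sketch (illustrative name):

  #include <stdint.h>

  /* Scalar sketch of the multiply blend: (((a << 8) | a) * b) >> 16. */
  static void ARGBMultiplyRow_Scalar(const uint8_t* src_argb0,
                                     const uint8_t* src_argb1,
                                     uint8_t* dst_argb, int width) {
    for (int i = 0; i < width * 4; ++i) {
      uint32_t a = src_argb0[i];
      uint32_t b = src_argb1[i];
      dst_argb[i] = (uint8_t)((((a << 8) | a) * b) >> 16);
    }
  }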
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
@@ -4691,9 +4015,9 @@
     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4701,13 +4025,39 @@
     "+r"(width)       // %3
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1"
-#endif
   );
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -4721,9 +4071,9 @@
     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "psubusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4731,13 +4081,39 @@
     "+r"(width)       // %3
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1"
-#endif
   );
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
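
The add and subtract kernels are plain per-byte saturating operations on the two input rows; a scalar sketch of the add (the subtract clamps at zero instead; illustrative name):

  #include <stdint.h>

  /* Scalar sketch of the saturating add used by the ARGBAddRow kernels. */
  static void ARGBAddRow_Scalar(const uint8_t* src_argb0,
                                const uint8_t* src_argb1,
                                uint8_t* dst_argb, int width) {
    for (int i = 0; i < width * 4; ++i) {
      int s = src_argb0[i] + src_argb1[i];
      dst_argb[i] = (uint8_t)(s > 255 ? 255 : s);
    }
  }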
+
 #ifdef HAS_SOBELXROW_SSE2
 // SobelX as a matrix is
 // -1  0  1
@@ -4759,13 +4135,11 @@
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm1                   \n"
     "psubw     %%xmm1,%%xmm0                   \n"
-    BUNDLEALIGN
     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
     "punpcklbw %%xmm5,%%xmm1                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "psubw     %%xmm2,%%xmm1                   \n"
-    BUNDLEALIGN
     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
     "punpcklbw %%xmm5,%%xmm2                   \n"
@@ -4778,10 +4152,9 @@
     "psubw     %%xmm0,%%xmm1                   \n"
     "pmaxsw    %%xmm1,%%xmm0                   \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "sub       $0x8,%4                         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
     "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%4                         \n"
     "jg        1b                              \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
@@ -4789,13 +4162,8 @@
     "+r"(dst_sobelx),  // %3
     "+r"(width)        // %4
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 #endif  // HAS_SOBELXROW_SSE2
@@ -4820,13 +4188,11 @@
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm1                   \n"
     "psubw     %%xmm1,%%xmm0                   \n"
-    BUNDLEALIGN
     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
     "punpcklbw %%xmm5,%%xmm1                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "psubw     %%xmm2,%%xmm1                   \n"
-    BUNDLEALIGN
     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
     "punpcklbw %%xmm5,%%xmm2                   \n"
@@ -4839,23 +4205,17 @@
     "psubw     %%xmm0,%%xmm1                   \n"
     "pmaxsw    %%xmm1,%%xmm0                   \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "sub       $0x8,%3                         \n"
-    BUNDLEALIGN
     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
     "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
     "+r"(dst_sobely),  // %2
     "+r"(width)        // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 #endif  // HAS_SOBELYROW_SSE2
@@ -4876,8 +4236,8 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -4893,25 +4253,20 @@
     "punpckhwd %%xmm0,%%xmm0                   \n"
     "por       %%xmm5,%%xmm3                   \n"
     "por       %%xmm5,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movdqa    %%xmm1," MEMACCESS(2) "         \n"
-    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
-    "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
+    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
     "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
 #endif  // HAS_SOBELROW_SSE2
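
SobelRow adds the two gradient planes with unsigned saturation and replicates the result into a gray ARGB pixel with opaque alpha; a rough scalar sketch (illustrative name):

  #include <stdint.h>

  /* Scalar sketch of SobelRow: gray = min(255, sobelx + sobely), alpha = 0xff. */
  static void SobelRow_Scalar(const uint8_t* src_sobelx,
                              const uint8_t* src_sobely,
                              uint8_t* dst_argb, int width) {
    for (int i = 0; i < width; ++i) {
      int s = src_sobelx[i] + src_sobely[i];
      uint8_t v = (uint8_t)(s > 255 ? 255 : s);
      dst_argb[4 * i + 0] = v;     /* B */
      dst_argb[4 * i + 1] = v;     /* G */
      dst_argb[4 * i + 2] = v;     /* R */
      dst_argb[4 * i + 3] = 0xff;  /* A */
    }
  }
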
@@ -4928,26 +4283,21 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%3                        \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_y),       // %2
     "+r"(width)        // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
   );
 }
 #endif  // HAS_SOBELTOPLANEROW_SSE2
@@ -4967,8 +4317,8 @@
     // 8 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "paddusb   %%xmm1,%%xmm2                   \n"
@@ -4984,25 +4334,20 @@
     "movdqa    %%xmm1,%%xmm7                   \n"
     "punpcklwd %%xmm0,%%xmm7                   \n"
     "punpckhwd %%xmm0,%%xmm1                   \n"
-    "sub       $0x10,%3                        \n"
-    "movdqa    %%xmm6," MEMACCESS(2) "         \n"
-    "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
-    "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
+    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
     "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
     "jg        1b                              \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_SOBELXYROW_SSE2
@@ -5035,22 +4380,22 @@
     "punpcklwd %%xmm1,%%xmm4                   \n"
     "punpckhwd %%xmm1,%%xmm5                   \n"
     "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
     "paddd     %%xmm0,%%xmm2                   \n"
     "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
+    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
     "paddd     %%xmm0,%%xmm3                   \n"
     "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
+    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
     "paddd     %%xmm0,%%xmm4                   \n"
     "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
+    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
     "lea       " MEMLEA(0x40,2) ",%2           \n"
     "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
-    "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
     "lea       " MEMLEA(0x40,1) ",%1           \n"
     "sub       $0x4,%3                         \n"
     "jge       40b                             \n"
@@ -5082,9 +4427,7 @@
     "+r"(width)  // %3
   :
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
   );
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
@@ -5115,11 +4458,10 @@
   // 4 pixel small loop                        \n"
     LABELALIGN
   "4:                                         \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    BUNDLEALIGN
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
@@ -5129,7 +4471,6 @@
     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    BUNDLEALIGN
     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
@@ -5149,11 +4490,10 @@
   // 4 pixel loop                              \n"
     LABELALIGN
   "40:                                         \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    BUNDLEALIGN
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
@@ -5163,7 +4503,6 @@
     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    BUNDLEALIGN
     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
@@ -5196,11 +4535,10 @@
   // 1 pixel loop                              \n"
     LABELALIGN
   "10:                                         \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    BUNDLEALIGN
     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
@@ -5219,13 +4557,8 @@
     "+rm"(count)    // %3
   : "r"((intptr_t)(width)),  // %4
     "rm"(area)     // %5
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -5268,7 +4601,6 @@
     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
     "movd      %%xmm0,%k5                      \n"
     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
     "punpckldq %%xmm6,%%xmm1                   \n"
@@ -5277,14 +4609,13 @@
     "movd      %%xmm0,%k1                      \n"
     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
     "movd      %%xmm0,%k5                      \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
     "punpckldq %%xmm6,%%xmm0                   \n"
     "addps     %%xmm4,%%xmm3                   \n"
-    "sub       $0x4,%4                         \n"
     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
     "jge       40b                             \n"
 
   "49:                                         \n"
@@ -5299,11 +4630,10 @@
     "pmaddwd   %%xmm5,%%xmm0                   \n"
     "addps     %%xmm7,%%xmm2                   \n"
     "movd      %%xmm0,%k1                      \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    "sub       $0x1,%4                         \n"
     "movd      %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x04,2) ",%2           \n"
+    "sub       $0x1,%4                         \n"
     "jge       10b                             \n"
   "19:                                         \n"
   : "+r"(src_argb),  // %0
@@ -5313,13 +4643,8 @@
     "+rm"(width),    // %4
     "+r"(temp)   // %5
   :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
@@ -5352,8 +4677,8 @@
     // General purpose row blend.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm2)
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm2,%%xmm0                   \n"
     "punpckhbw %%xmm2,%%xmm1                   \n"
@@ -5362,61 +4687,57 @@
     "psrlw     $0x7,%%xmm0                     \n"
     "psrlw     $0x7,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
     "jmp       99f                             \n"
 
     // Blend 25 / 75.
     LABELALIGN
   "25:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        25b                             \n"
     "jmp       99f                             \n"
 
     // Blend 50 / 50.
     LABELALIGN
   "50:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        50b                             \n"
     "jmp       99f                             \n"
 
     // Blend 75 / 25.
     LABELALIGN
   "75:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        75b                             \n"
     "jmp       99f                             \n"
 
     // Blend 100 / 0 - Copy row unchanged.
     LABELALIGN
   "100:                                        \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    "sub       $0x10,%2                        \n"
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
@@ -5425,17 +4746,113 @@
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm5"
   );
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
 
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  asm volatile (
+    "shr       %3                              \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "sub       %1,%0                           \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "vmovd      %3,%%xmm0                      \n"
+    "neg        %3                             \n"
+    "add        $0x80,%3                       \n"
+    "vmovd      %3,%%xmm5                      \n"
+    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
+    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
+    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "rep movsb " MEMMOVESTRING(1,0) "          \n"
+    "jmp       999f                            \n"
+
+  "99:                                         \n"
+    "vzeroupper                                \n"
+  "999:                                        \n"
+  : "+D"(dst_ptr),    // %0
+    "+S"(src_ptr),    // %1
+    "+c"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
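
In each of these interpolate kernels the fraction is halved up front, so the general path computes dst[i] = (src[i] * (128 - f) + src[i + src_stride] * f) >> 7 with f in [0, 128], while the 0/25/50/75/100% cases take the special-cased branches; a scalar sketch of the general path (illustrative name):

  #include <stddef.h>
  #include <stdint.h>

  /* Scalar sketch of the general row blend used by the InterpolateRow kernels. */
  static void InterpolateRow_Scalar(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
    int f = source_y_fraction >> 1;  /* 0..128, matching the "shr %3" above */
    for (int i = 0; i < dst_width; ++i) {
      dst_ptr[i] = (uint8_t)(
          (src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7);
    }
  }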
+
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
@@ -5465,8 +4882,8 @@
     // General purpose row blend.
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"
@@ -5482,242 +4899,9 @@
     "paddw     %%xmm2,%%xmm0                   \n"
     "paddw     %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 25 / 75.
-    LABELALIGN
-  "25:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        25b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 75 / 25.
-    LABELALIGN
-  "75:                                         \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
-    MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        75b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
-    "sub       $0x10,%2                        \n"
-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        100b                            \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_INTERPOLATEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride, int dst_width,
-                                    int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x20,%3                        \n"
-    "je        75f                             \n"
-    "cmp       $0x40,%3                        \n"
-    "je        50f                             \n"
-    "cmp       $0x60,%3                        \n"
-    "je        25f                             \n"
-
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
-    "movdqu    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "pmaddubsw %%xmm5,%%xmm0                   \n"
-    "pmaddubsw %%xmm5,%%xmm1                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    // Blend 25 / 75.
-    LABELALIGN
-  "25:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        25b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 75 / 25.
-    LABELALIGN
-  "75:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        75b                             \n"
-    "jmp       99f                             \n"
-
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "sub       $0x10,%2                        \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "jg        100b                            \n"
-
-  "99:                                         \n"
-  : "+r"(dst_ptr),    // %0
-    "+r"(src_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
-  );
-}
-#endif   // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride, int dst_width,
-                                   int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "shr       %3                              \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x20,%3                        \n"
-    "je        75f                             \n"
-    "cmp       $0x40,%3                        \n"
-    "je        50f                             \n"
-    "cmp       $0x60,%3                        \n"
-    "je        25f                             \n"
-
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x80,%3                        \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pxor      %%xmm4,%%xmm4                   \n"
-
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
-    "movdqu    %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm2                   \n"
-    "punpckhbw %%xmm4,%%xmm3                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm2                   \n"
-    "psubw     %%xmm1,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm2                   \n"
-    "paddw     %%xmm3,%%xmm3                   \n"
-    "pmulhw    %%xmm5,%%xmm2                   \n"
-    "pmulhw    %%xmm5,%%xmm3                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
     "jmp       99f                             \n"
 
@@ -5728,10 +4912,9 @@
     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        25b                             \n"
     "jmp       99f                             \n"
 
@@ -5741,10 +4924,9 @@
     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        50b                             \n"
     "jmp       99f                             \n"
 
@@ -5755,10 +4937,9 @@
     MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    BUNDLEALIGN
     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        75b                             \n"
     "jmp       99f                             \n"
 
@@ -5766,9 +4947,9 @@
     LABELALIGN
   "100:                                        \n"
     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "sub       $0x10,%2                        \n"
     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
@@ -5777,139 +4958,18 @@
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
-#ifdef HAS_HALFROW_SSE2
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
-    "sub       $0x10,%2                        \n"
-    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "jg        1b                              \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_uv),  // %1
-    "+r"(pix)      // %2
-  : "r"((intptr_t)(src_uv_stride))  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-      , "xmm0"
-#endif
-  );
-}
-#endif  // HAS_HALFROW_SSE2
-
-#ifdef HAS_ARGBTOBAYERROW_SSSE3
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                          uint32 selector, int pix) {
-  asm volatile (
-    // NaCL caveat - assumes movd is from GPR
-    "movd      %3,%%xmm5                       \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "sub       $0x8,%2                         \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_bayer), // %1
-    "+r"(pix)        // %2
-  : "g"(selector)    // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBTOBAYERROW_SSSE3
-
-#ifdef HAS_ARGBTOBAYERGGROW_SSE2
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 selector, int pix) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrld     $0x8,%%xmm0                     \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x8,%2                         \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_bayer), // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-#endif  // HAS_ARGBTOBAYERGGROW_SSE2
-
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
   asm volatile (
-    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix) {
-  asm volatile (
-    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
@@ -5917,19 +4977,17 @@
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%2                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "r"(shuffler)    // %3
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
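
The shuffle kernels reorder bytes within each 16-byte (4-pixel) group according to the shuffler control, e.g. for ARGB<->ABGR channel swaps; a scalar sketch of that mapping (illustrative name, assuming a pixel count that is a multiple of 4 and in-range shuffler indices):

  #include <stdint.h>

  /* Scalar sketch of ARGBShuffleRow: per 16-byte group, out[j] = in[shuffler[j]]. */
  static void ARGBShuffleRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const uint8_t* shuffler, int pix) {
    for (int i = 0; i < pix * 4; i += 16) {
      for (int j = 0; j < 16; ++j) {
        dst_argb[i + j] = src_argb[i + (shuffler[j] & 15)];
      }
    }
  }
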
@@ -5947,19 +5005,18 @@
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
-    "sub       $0x10,%2                        \n"
     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
     "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
+    "vzeroupper                                \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "r"(shuffler)    // %3
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm5"
-#endif
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
@@ -5989,7 +5046,6 @@
     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
-    BUNDLEALIGN
     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
@@ -6014,9 +5070,9 @@
     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
     "jg        123b                            \n"
     "jmp       99f                             \n"
 
@@ -6032,9 +5088,9 @@
     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
     "jg        321b                            \n"
     "jmp       99f                             \n"
 
@@ -6050,9 +5106,9 @@
     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
     "jg        2103b                           \n"
     "jmp       99f                             \n"
 
@@ -6068,9 +5124,9 @@
     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
     "jg        3012b                           \n"
 
   "99:                                         \n"
@@ -6079,13 +5135,8 @@
     "+d"(pixel_temp),  // %2
     "+r"(pix)         // %3
   : "r"(shuffler)      // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_SSE2
@@ -6119,13 +5170,8 @@
       "+r"(dst_frame),  // %3
       "+rm"(width)  // %4
     :
-    : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
 #endif  // HAS_I422TOYUY2ROW_SSE2
@@ -6159,13 +5205,8 @@
       "+r"(dst_frame),  // %3
       "+rm"(width)  // %4
     :
-    : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
 #endif  // HAS_I422TOUYVYROW_SSE2
@@ -6212,18 +5253,16 @@
     "cvttps2dq %%xmm4,%%xmm4                   \n"
     "packuswb  %%xmm4,%%xmm0                   \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "sub       $0x2,%2                         \n"
     "movq      %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x2,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(width)      // %2
   : "r"(poly)        // %3
   : "memory", "cc"
-#if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
   );
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
@@ -6253,20 +5292,17 @@
     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
-    "sub         $0x2,%2                       \n"
     "vmovq       %%xmm0," MEMACCESS(1) "       \n"
     "lea         " MEMLEA(0x8,1) ",%1          \n"
+    "sub         $0x2,%2                       \n"
     "jg          1b                            \n"
     "vzeroupper                                \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(width)      // %2
   : "r"(poly)        // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-// TODO(fbarchard): declare ymm usage when applicable.
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
@@ -6376,7 +5412,6 @@
     "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
     "mov       %b0," MEMACCESS2(0x4,3) "       \n"
-    BUNDLEALIGN
     "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
     "mov       %b0," MEMACCESS2(0x5,3) "       \n"
@@ -6416,9 +5451,9 @@
     "mov       %b0," MEMACCESS2(0xe,3) "       \n"
     "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
     "mov       %b0," MEMACCESS2(0xf,3) "       \n"
-    "sub       $0x4,%4                         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
     "lea       " MEMLEA(0x10,3) ",%3           \n"
+    "sub       $0x4,%4                         \n"
     "jg        1b                              \n"
   : "+d"(pixel_temp),  // %0
     "+a"(table_temp),  // %1
@@ -6427,10 +5462,7 @@
     "+rm"(width)       // %4
   : "r"(luma),         // %5
     "rm"(lumacoeff)    // %6
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
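Note: the clobber-list cleanup in the hunks above relies on helper macros from
libvpx/third_party/libyuv/include/libyuv/row.h (MEMACCESS, MEMLEA, NACL_R14 and
friends), which are not part of this diff. A minimal sketch of the NACL_R14
convention, assuming the upstream definition; it also shows why no comma is
needed between NACL_R14 and the "xmm0" clobber that follows it:

  /* Sketch only; the real definition lives in libyuv's row.h. */
  #if defined(__native_client__) && defined(__x86_64__)
  #define NACL_R14 "r14",  /* r14 is also clobbered in Native Client x86-64 builds */
  #else
  #define NACL_R14         /* expands to nothing elsewhere */
  #endif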
diff --git a/libvpx/third_party/libyuv/source/row_mips.cc b/libvpx/third_party/libyuv/source/row_mips.cc
index ae9370c..cfc9ffe 100644
--- a/libvpx/third_party/libyuv/source/row_mips.cc
+++ b/libvpx/third_party/libyuv/source/row_mips.cc
@@ -378,7 +378,7 @@
 // MIPS DSPR2 functions
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
     (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
 
 void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
@@ -447,89 +447,6 @@
   );
 }
 
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
-                                     uint8* dst_v, int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-    ".p2align        2                             \n"
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lwr             $t0, 0(%[src_uv])             \n"
-    "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lwr             $t1, 4(%[src_uv])             \n"
-    "lwl             $t1, 7(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lwr             $t2, 8(%[src_uv])             \n"
-    "lwl             $t2, 11(%[src_uv])            \n"  // V5 | U5 | V4 | U4
-    "lwr             $t3, 12(%[src_uv])            \n"
-    "lwl             $t3, 15(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lwr             $t5, 16(%[src_uv])            \n"
-    "lwl             $t5, 19(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lwr             $t6, 20(%[src_uv])            \n"
-    "lwl             $t6, 23(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lwr             $t7, 24(%[src_uv])            \n"
-    "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lwr             $t8, 28(%[src_uv])            \n"
-    "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "swr             $t9, 0(%[dst_v])              \n"
-    "swl             $t9, 3(%[dst_v])              \n"
-    "swr             $t0, 0(%[dst_u])              \n"
-    "swl             $t0, 3(%[dst_u])              \n"
-    "swr             $t1, 4(%[dst_v])              \n"
-    "swl             $t1, 7(%[dst_v])              \n"
-    "swr             $t2, 4(%[dst_u])              \n"
-    "swl             $t2, 7(%[dst_u])              \n"
-    "swr             $t3, 8(%[dst_v])              \n"
-    "swl             $t3, 11(%[dst_v])             \n"
-    "swr             $t5, 8(%[dst_u])              \n"
-    "swl             $t5, 11(%[dst_u])             \n"
-    "swr             $t6, 12(%[dst_v])             \n"
-    "swl             $t6, 15(%[dst_v])             \n"
-    "swr             $t7, 12(%[dst_u])             \n"
-    "swl             $t7, 15(%[dst_u])             \n"
-    "addiu           %[dst_u], %[dst_u], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_v], %[dst_v], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
-
 void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
   __asm__ __volatile__ (
     ".set push                             \n"
@@ -927,9 +844,9 @@
 }
 
 // Bilinear filter 8x2 -> 8x1
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                                ptrdiff_t src_stride, int dst_width,
-                                int source_y_fraction) {
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride, int dst_width,
+                               int source_y_fraction) {
     int y0_fraction = 256 - source_y_fraction;
     const uint8* src_ptr1 = src_ptr + src_stride;
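For reference, a rough scalar equivalent of the renamed InterpolateRow_MIPS_DSPR2
bilinear filter (a sketch only; the DSPR2 assembly above processes several pixels
per iteration and its exact rounding may differ):

  #include <stddef.h>

  /* Blend two rows: dst = (src * (256 - frac) + src_below * frac) >> 8. */
  void InterpolateRowSketch(unsigned char* dst_ptr, const unsigned char* src_ptr,
                            ptrdiff_t src_stride, int dst_width,
                            int source_y_fraction) {
    const int y1_fraction = source_y_fraction;        /* weight of second row */
    const int y0_fraction = 256 - source_y_fraction;  /* weight of first row  */
    const unsigned char* src_ptr1 = src_ptr + src_stride;
    int x;
    for (x = 0; x < dst_width; ++x) {
      dst_ptr[x] = (unsigned char)
          ((src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8);
    }
  }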
 
diff --git a/libvpx/third_party/libyuv/source/row_neon.cc b/libvpx/third_party/libyuv/source/row_neon.cc
index 1392cf5..1a72eb9 100644
--- a/libvpx/third_party/libyuv/source/row_neon.cc
+++ b/libvpx/third_party/libyuv/source/row_neon.cc
@@ -16,7 +16,8 @@
 #endif
 
 // This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
 #define READYUV422                                                             \
@@ -92,36 +93,79 @@
     "vuzp.u8    d2, d3                         \n"                             \
     "vtrn.u32   d2, d3                         \n"
 
-#define YUV422TORGB                                                            \
-    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
-    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
-    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
-    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
-    "vtrn.u8    d0, d1                         \n"                             \
-    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
-    "vmul.s16   q0, q0, q14                    \n"                             \
-    "vadd.s16   d18, d19                       \n"                             \
-    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
-    "vqadd.s16  d21, d1, d16                   \n"                             \
-    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
-    "vqadd.s16  d23, d1, d17                   \n"                             \
-    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
-    "vqadd.s16  d17, d1, d18                   \n"                             \
-    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
-    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
-    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
-    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
-    "vmovl.u8   q11, d1                        \n"                             \
-    "vmovl.u8   q8, d2                         \n"                             \
-    "vtrn.u8    d20, d21                       \n"                             \
-    "vtrn.u8    d22, d23                       \n"                             \
-    "vtrn.u8    d16, d17                       \n"                             \
-    "vmov.u8    d21, d16                       \n"
+#define YUV422TORGB_SETUP_REG                                                  \
+    MEMACCESS([kUVToRB])                                                       \
+    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
+    MEMACCESS([kUVToG])                                                        \
+    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
+    MEMACCESS([kYToRgb])                                                       \
+    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
 
-static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
-                         0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
-                       0, 0, 0, 0, 0, 0, 0, 0 };
+#define YUV422TORGB                                                            \
+    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
+    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
+    "vmovl.u8   q0, d0                         \n" /* Y                      */\
+    "vmovl.s16  q10, d1                        \n"                             \
+    "vmovl.s16  q0, d0                         \n"                             \
+    "vmul.s32   q10, q10, q15                  \n"                             \
+    "vmul.s32   q0, q0, q15                    \n"                             \
+    "vqshrun.s32 d0, q0, #16                   \n"                             \
+    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
+    "vadd.s16   d18, d19                       \n"                             \
+    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
+    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
+    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
+    "vaddw.u16  q1, q1, d16                    \n"                             \
+    "vaddw.u16  q10, q10, d17                  \n"                             \
+    "vaddw.u16  q3, q3, d18                    \n"                             \
+    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
+    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
+    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
+    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
+    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
+    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
+    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
+    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
+    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR            (VR * 128 - YGB)
+
+static uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,
+                          0, 0, 0, 0, 0, 0, 0, 0 };
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+                        0, 0, 0, 0, 0, 0, 0, 0 };
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
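A scalar sketch of the fixed-point math these constants encode and that the NEON
kernels below implement (the constants are re-declared locally because the macros
above are #undef'd at this point; clamp255() is an assumed helper, not libyuv API):

  static int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

  static void YuvPixelSketch(unsigned char y, unsigned char u, unsigned char v,
                             unsigned char* b, unsigned char* g, unsigned char* r) {
    enum { YG = 18997, YGB = 1160, UB = -128, UG = 25, VG = 52, VR = -102 };
    const int BB = UB * 128 - YGB;
    const int BG = UG * 128 + VG * 128 - YGB;
    const int BR = VR * 128 - YGB;
    /* y1 is roughly 1.164 * Y, scaled by 64. */
    int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);
    /* B adds ~2.0*(U-128), G subtracts 0.391*(U-128) + 0.813*(V-128),
       R adds 1.596*(V-128); the final >> 6 removes the x64 scaling. */
    *b = (unsigned char)clamp255((y1 - u * UB + BB) >> 6);
    *g = (unsigned char)clamp255((y1 - (u * UG + v * VG) + BG) >> 6);
    *r = (unsigned char)clamp255((y1 - v * VR + BR) >> 6);
  }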
 
 void I444ToARGBRow_NEON(const uint8* src_y,
                         const uint8* src_u,
@@ -129,13 +173,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV444
@@ -150,9 +188,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -163,13 +203,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -184,9 +218,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -197,13 +233,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV411
@@ -218,9 +248,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -231,13 +263,7 @@
                         uint8* dst_bgra,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -253,9 +279,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_bgra),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -266,13 +294,7 @@
                         uint8* dst_abgr,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -288,9 +310,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_abgr),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -301,13 +325,7 @@
                         uint8* dst_rgba,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -322,9 +340,11 @@
       "+r"(src_v),     // %2
       "+r"(dst_rgba),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -335,13 +355,7 @@
                          uint8* dst_rgb24,
                          int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -355,9 +369,11 @@
       "+r"(src_v),      // %2
       "+r"(dst_rgb24),  // %3
       "+r"(width)       // %4
-    : "r"(&kUVToRB),    // %5
-      "r"(&kUVToG)      // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -368,13 +384,7 @@
                        uint8* dst_raw,
                        int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -389,9 +399,11 @@
       "+r"(src_v),    // %2
       "+r"(dst_raw),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -414,13 +426,7 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -435,9 +441,11 @@
       "+r"(src_v),    // %2
       "+r"(dst_rgb565),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -463,13 +471,7 @@
                             uint8* dst_argb1555,
                             int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV422
@@ -485,9 +487,11 @@
       "+r"(src_v),    // %2
       "+r"(dst_argb1555),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -507,13 +511,7 @@
                             uint8* dst_argb4444,
                             int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -530,24 +528,20 @@
       "+r"(src_v),    // %2
       "+r"(dst_argb4444),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void YToARGBRow_NEON(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width) {
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
   asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUV400
@@ -560,19 +554,21 @@
     : "+r"(src_y),     // %0
       "+r"(dst_argb),  // %1
       "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
+void J400ToARGBRow_NEON(const uint8* src_y,
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    ".p2align   2                              \n"
     "vmov.u8    d23, #255                      \n"
+    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
     "vld1.8     {d20}, [%0]!                   \n"
@@ -595,13 +591,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READNV12
@@ -615,9 +605,11 @@
       "+r"(src_uv),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -627,13 +619,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READNV21
@@ -647,9 +633,11 @@
       "+r"(src_uv),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -659,13 +647,7 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READNV12
@@ -679,9 +661,11 @@
       "+r"(src_uv),    // %1
       "+r"(dst_rgb565),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -691,13 +675,7 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READNV21
@@ -711,9 +689,11 @@
       "+r"(src_uv),    // %1
       "+r"(dst_rgb565),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -722,13 +702,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READYUY2
@@ -741,9 +715,11 @@
     : "+r"(src_yuy2),  // %0
       "+r"(dst_argb),  // %1
       "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -752,13 +728,7 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
+    YUV422TORGB_SETUP_REG
     ".p2align   2                              \n"
   "1:                                          \n"
     READUYVY
@@ -771,9 +741,11 @@
     : "+r"(src_uyvy),  // %0
       "+r"(dst_argb),  // %1
       "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -844,12 +816,28 @@
   );
 }
 
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "q0"
+  );
+}
+
+// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
     "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-    "1:                                        \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+  "1:                                          \n"
+    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
     MEMACCESS(0)
     "vst1.8    {q0}, [%0]!                     \n"  // store
     "bgt       1b                              \n"
@@ -860,16 +848,6 @@
   );
 }
 
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
-                      int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    SetRow_NEON(dst, v32, width << 2);
-    dst += dst_stride;
-  }
-}
-
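Scalar sketches of the new SetRow_NEON / ARGBSetRow_NEON kernels above, which
replace the old SetRow/ARGBSetRows pair (hypothetical names; libyuv's real C
fallbacks live in row_common.cc and are not part of this diff):

  #include <string.h>

  static void SetRowSketch(unsigned char* dst, unsigned char v8, int count) {
    memset(dst, v8, count);               /* count is in bytes */
  }

  static void ARGBSetRowSketch(unsigned char* dst, unsigned int v32, int count) {
    int i;
    for (i = 0; i < count; ++i) {         /* count is in pixels, 4 bytes each */
      memcpy(dst + i * 4, &v32, 4);
    }
  }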
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
@@ -1273,72 +1251,6 @@
   );
 }
 
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
-    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),         // %0
-    "+r"(src_uv_stride),  // %1
-    "+r"(dst_uv),         // %2
-    "+r"(pix)             // %3
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) {
-  asm volatile (
-    "vmov.u32   d6[0], %3                      \n"  // selector
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
-    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
-    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_bayer),  // %1
-    "+r"(pix)         // %2
-  : "r"(selector)     // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-
-// Select G channels from ARGB.  e.g.  GGGGGGGG
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /*selector*/, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_bayer),  // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
@@ -1435,6 +1347,30 @@
   );
 }
 
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    d2, %2                         \n"  // dither4
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d20, d20, d2                   \n"
+    "vqadd.u8   d21, d21, d2                   \n"
+    "vqadd.u8   d22, d22, d2                   \n"
+    ARGBTORGB565
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+  : "+r"(dst_rgb)    // %0
+  : "r"(src_argb),   // %1
+    "r"(dither4),    // %2
+    "r"(width)       // %3
+  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
+  );
+}
+
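A rough scalar equivalent of the new dithered RGB565 conversion above (a sketch;
clamp255() and the function name are assumptions, and libyuv stores ARGB as
B, G, R, A bytes in memory):

  #include <string.h>

  static int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

  static void ARGBToRGB565DitherRowSketch(const unsigned char* src_argb,
                                          unsigned char* dst_rgb,
                                          unsigned int dither4, int width) {
    int i;
    for (i = 0; i < width; ++i) {
      /* dither4 packs four per-pixel dither bytes that repeat every 4 pixels. */
      int d = (int)((dither4 >> ((i & 3) * 8)) & 0xff);
      int b = clamp255(src_argb[i * 4 + 0] + d);
      int g = clamp255(src_argb[i * 4 + 1] + d);
      int r = clamp255(src_argb[i * 4 + 2] + d);
      unsigned short rgb565 =
          (unsigned short)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
      memcpy(dst_rgb + i * 2, &rgb565, 2);  /* little-endian 16-bit store */
    }
  }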
 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                             int pix) {
   asm volatile (
@@ -2832,7 +2768,7 @@
     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
     "vmovl.u8   q9, d18                        \n"  // g
     "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q15, d22                       \n"  // a
+    "vmovl.u8   q11, d22                       \n"  // a
     "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
     "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
     "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
@@ -2853,10 +2789,10 @@
     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
+    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
@@ -2872,7 +2808,7 @@
     "+r"(dst_argb),   // %1
     "+r"(width)       // %2
   : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
     "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
@@ -3140,7 +3076,7 @@
   : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // __ARM_NEON__
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/row_neon64.cc b/libvpx/third_party/libyuv/source/row_neon64.cc
index 46e9ceb..5d01545 100644
--- a/libvpx/third_party/libyuv/source/row_neon64.cc
+++ b/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -15,113 +15,157 @@
 extern "C" {
 #endif
 
-// This module is for GCC Neon
+// This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
 #define READYUV422                                                             \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    "ld1        {v1.s}[0], [%1], #4            \n"                             \
     MEMACCESS(2)                                                               \
-    "vld1.32    {d2[1]}, [%2]!                 \n"
+    "ld1        {v1.s}[1], [%2], #4            \n"
 
 // Read 8 Y, 2 U and 2 V from 422
 #define READYUV411                                                             \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    "ld1        {v2.h}[0], [%1], #2            \n"                             \
     MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
+    "ld1        {v2.h}[1], [%2], #2            \n"                             \
+    "zip1       v1.8b, v2.8b, v2.8b            \n"
 
 // Read 8 Y, 8 U and 8 V from 444
 #define READYUV444                                                             \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "ld1        {v1.d}[0], [%1], #8            \n"                             \
     MEMACCESS(2)                                                               \
-    "vld1.8     {d3}, [%2]!                    \n"                             \
-    "vpaddl.u8  q1, q1                         \n"                             \
-    "vrshrn.u16 d2, q1, #1                     \n"
+    "ld1        {v1.d}[1], [%2], #8            \n"                             \
+    "uaddlp     v1.8h, v1.16b                  \n"                             \
+    "rshrn      v1.8b, v1.8h, #1               \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
 #define READYUV400                                                             \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    "movi       v1.8b , #128                   \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
     MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 YUY2
 #define READYUY2                                                               \
     MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
+    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
+    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 UYVY
 #define READUYVY                                                               \
     MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
+    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
 
-#define YUV422TORGB                                                            \
-    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
-    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
-    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
-    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
-    "vtrn.u8    d0, d1                         \n"                             \
-    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
-    "vmul.s16   q0, q0, q14                    \n"                             \
-    "vadd.s16   d18, d19                       \n"                             \
-    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
-    "vqadd.s16  d21, d1, d16                   \n"                             \
-    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
-    "vqadd.s16  d23, d1, d17                   \n"                             \
-    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
-    "vqadd.s16  d17, d1, d18                   \n"                             \
-    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
-    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
-    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
-    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
-    "vmovl.u8   q11, d1                        \n"                             \
-    "vmovl.u8   q8, d2                         \n"                             \
-    "vtrn.u8    d20, d21                       \n"                             \
-    "vtrn.u8    d22, d23                       \n"                             \
-    "vtrn.u8    d16, d17                       \n"                             \
-    "vmov.u8    d21, d16                       \n"
+#define YUV422TORGB_SETUP_REG                                                  \
+    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
+    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
+    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
+    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
+    "movi       v27.8h, #128                   \n"                             \
+    "movi       v28.8h, #102                   \n"                             \
+    "movi       v29.8h, #25                    \n"                             \
+    "movi       v30.8h, #52                    \n"
 
-static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
-                         0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
-                       0, 0, 0, 0, 0, 0, 0, 0 };
+#define YUV422TORGB(vR, vG, vB)                                                \
+    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
+    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
+    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
+    "ushll      v0.4s, v0.4h, #0               \n"                             \
+    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
+    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
+    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
+    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
+    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
+    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
+    "uxtl       v2.8h, v2.8b                   \n"                             \
+    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
+    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
+    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
+    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
+    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
+    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
+    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
+    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
+    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
+    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
+    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
+    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
+    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
+    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
+    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR            (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG                                                      \
+    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
+    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
+    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
+    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
+    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
+    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
+
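Effective per-pixel formulas implied by the coefficient comments above (a sketch;
the register values are half of the usual 112/74/38/18/94 fixed-point coefficients,
presumably because the subsampling kernels consume pairwise sums of neighbouring
pixels):

  /* U = 0.875*B - 0.578*G - 0.297*R + 128;  V = 0.875*R - 0.734*G - 0.141*B + 128 */
  static void RGBToUVSketch(int r, int g, int b, unsigned char* u, unsigned char* v) {
    *u = (unsigned char)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *v = (unsigned char)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }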
 
 #ifdef HAS_I444TOARGBROW_NEON
 void I444ToARGBRow_NEON(const uint8* src_y,
@@ -130,31 +174,24 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV444
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                 \n"
+    "movi       v23.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I444TOARGBROW_NEON
@@ -166,31 +203,24 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TOARGBROW_NEON
@@ -202,31 +232,24 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV411
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_argb),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I411TOARGBROW_NEON
@@ -238,32 +261,24 @@
                         uint8* dst_bgra,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vmov.u8    d19, #255                      \n"
+    YUV422TORGB(v21, v22, v23)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v20.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_bgra),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TOBGRAROW_NEON
@@ -275,32 +290,24 @@
                         uint8* dst_abgr,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v20, v21, v22)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_abgr),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TOABGRROW_NEON
@@ -312,31 +319,24 @@
                         uint8* dst_rgba,
                         int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"
+    YUV422TORGB(v23, v22, v21)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v20.8b, #255                   \n" /* A */
     MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_u),     // %1
       "+r"(src_v),     // %2
       "+r"(dst_rgba),  // %3
       "+r"(width)      // %4
-    : "r"(&kUVToRB),   // %5
-      "r"(&kUVToG)     // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TORGBAROW_NEON
@@ -348,30 +348,23 @@
                          uint8* dst_rgb24,
                          int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
     MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_u),      // %1
-      "+r"(src_v),      // %2
-      "+r"(dst_rgb24),  // %3
-      "+r"(width)       // %4
-    : "r"(&kUVToRB),    // %5
-      "r"(&kUVToG)      // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgb24), // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TORGB24ROW_NEON
@@ -383,46 +376,33 @@
                        uint8* dst_raw,
                        int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vswp.u8    d20, d22                       \n"
+    YUV422TORGB(v20, v21, v22)
+    "subs       %w4, %w4, #8                   \n"
     MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_raw),  // %3
-      "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_raw),   // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TORAWROW_NEON
 
 #define ARGBTORGB565                                                           \
-    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
-    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
-    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
-    "vorr       q0, q0, q10                    \n"  /* BGR                  */
+    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
+    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
 
 #ifdef HAS_I422TORGB565ROW_NEON
 void I422ToRGB565Row_NEON(const uint8* src_y,
@@ -431,49 +411,36 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
     ARGBTORGB565
     MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
     : "+r"(src_y),    // %0
       "+r"(src_u),    // %1
       "+r"(src_v),    // %2
       "+r"(dst_rgb565),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TORGB565ROW_NEON
 
 #define ARGBTOARGB1555                                                         \
-    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
-    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
-    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
-    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
-    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
-    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
-    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
+    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
+    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
+    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
+    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
 
 #ifdef HAS_I422TOARGB1555ROW_NEON
 void I422ToARGB1555Row_NEON(const uint8* src_y,
@@ -482,44 +449,38 @@
                             uint8* dst_argb1555,
                             int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     ARGBTOARGB1555
     MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
     : "+r"(src_y),    // %0
       "+r"(src_u),    // %1
       "+r"(src_v),    // %2
       "+r"(dst_argb1555),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TOARGB1555ROW_NEON
 
 #define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
+    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
+    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
+    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
+    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
+    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
+    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
 
 #ifdef HAS_I422TOARGB4444ROW_NEON
 void I422ToARGB4444Row_NEON(const uint8* src_y,
@@ -528,93 +489,79 @@
                             uint8* dst_argb4444,
                             int width) {
   asm volatile (
-    MEMACCESS(5)
-    "vld1.8     {d24}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {d25}, [%6]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
   "1:                                          \n"
     READYUV422
-    YUV422TORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w4, %w4, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     ARGBTOARGB4444
     MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
     : "+r"(src_y),    // %0
       "+r"(src_u),    // %1
       "+r"(src_v),    // %2
       "+r"(dst_argb4444),  // %3
       "+r"(width)     // %4
-    : "r"(&kUVToRB),  // %5
-      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_I422TOARGB4444ROW_NEON
 
-#ifdef HAS_YTOARGBROW_NEON
-void YToARGBRow_NEON(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
-  "1:                                          \n"
-    READYUV400
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-#endif  // HAS_YTOARGBROW_NEON
-
 #ifdef HAS_I400TOARGBROW_NEON
 void I400ToARGBRow_NEON(const uint8* src_y,
                         uint8* dst_argb,
                         int width) {
+  int64 width64 = (int64)(width);
   asm volatile (
-    ".p2align   2                              \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB_SETUP_REG
+  "1:                                          \n"
+    READYUV400
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    "movi       v23.8b, #255                   \n"
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_J400TOARGBROW_NEON
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "movi       v23.8b, #255                   \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d20}, [%0]!                   \n"
-    "vmov       d21, d20                       \n"
-    "vmov       d22, d20                       \n"
-    "subs       %2, %2, #8                     \n"
+    "ld1        {v20.8b}, [%0], #8             \n"
+    "orr        v21.8b, v20.8b, v20.8b         \n"
+    "orr        v22.8b, v20.8b, v20.8b         \n"
+    "subs       %w2, %w2, #8                   \n"
     MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(dst_argb),  // %1
       "+r"(width)      // %2
     :
-    : "cc", "memory", "d20", "d21", "d22", "d23"
+    : "cc", "memory", "v20", "v21", "v22", "v23"
   );
 }
-#endif  // HAS_I400TOARGBROW_NEON
+#endif  // HAS_J400TOARGBROW_NEON
 
 #ifdef HAS_NV12TOARGBROW_NEON
 void NV12ToARGBRow_NEON(const uint8* src_y,
@@ -622,30 +569,23 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READNV12
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_NV12TOARGBROW_NEON
@@ -656,30 +596,23 @@
                         uint8* dst_argb,
                         int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READNV21
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
       "+r"(dst_argb),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_NV21TOARGBROW_NEON
@@ -690,30 +623,23 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READNV12
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
     ARGBTORGB565
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
+    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
       "+r"(dst_rgb565),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_NV12TORGB565ROW_NEON
@@ -724,30 +650,23 @@
                           uint8* dst_rgb565,
                           int width) {
   asm volatile (
-    MEMACCESS(4)
-    "vld1.8     {d24}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {d25}, [%5]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READNV21
-    YUV422TORGB
-    "subs       %3, %3, #8                     \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
     ARGBTORGB565
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
+    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
       "+r"(dst_rgb565),  // %2
       "+r"(width)      // %3
-    : "r"(&kUVToRB),   // %4
-      "r"(&kUVToG)     // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_NV21TORGB565ROW_NEON
@@ -756,30 +675,24 @@
 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                         uint8* dst_argb,
                         int width) {
+  int64 width64 = (int64)(width);
   asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READYUY2
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
+    "b.gt       1b                             \n"
     : "+r"(src_yuy2),  // %0
       "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_YUY2TOARGBROW_NEON
@@ -788,30 +701,24 @@
 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                         uint8* dst_argb,
                         int width) {
+  int64 width64 = (int64)(width);
   asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {d24}, [%3]                    \n"
-    MEMACCESS(4)
-    "vld1.8     {d25}, [%4]                    \n"
-    "vmov.u8    d26, #128                      \n"
-    "vmov.u16   q14, #74                       \n"
-    "vmov.u16   q15, #16                       \n"
-    ".p2align   2                              \n"
+    YUV422TORGB_SETUP_REG
   "1:                                          \n"
     READUYVY
-    YUV422TORGB
-    "subs       %2, %2, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
+    YUV422TORGB(v22, v21, v20)
+    "subs       %w2, %w2, #8                   \n"
+    "movi       v23.8b, #255                   \n"
     MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
+    "b.gt       1b                             \n"
     : "+r"(src_uyvy),  // %0
       "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : "r"(&kUVToRB),   // %3
-      "r"(&kUVToG)     // %4
-    : "cc", "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_UYVYTOARGBROW_NEON
@@ -821,22 +728,21 @@
 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
     MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
-    "bgt        1b                             \n"
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
+    "b.gt       1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_SPLITUVROW_NEON
@@ -846,23 +752,22 @@
 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
     MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
+    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+    "b.gt       1b                             \n"
     :
       "+r"(src_u),   // %0
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_MERGEUVROW_NEON
@@ -871,77 +776,76 @@
 #ifdef HAS_COPYROW_NEON
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
+    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
-    "bgt        1b                             \n"
+    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
+    "b.gt       1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(count)  // %2  // Output registers
   :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_COPYROW_NEON
 
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-#ifdef HAS_SETROW_NEON
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8-bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
   asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-    "1:                                        \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
     MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "v0"
+  );
+}
+
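+// Note: SetRow_NEON stores 16 bytes per iteration and loops while the
+// remaining count is positive, so a count that is not a multiple of 16 is
+// effectively rounded up by the final store; ARGBSetRow_NEON below behaves the
+// same way in units of 4 pixels (16 bytes).
+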
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"
   : "+r"(dst),   // %0
     "+r"(count)  // %1
   : "r"(v32)     // %2
-  : "cc", "memory", "q0"
+  : "cc", "memory", "v0"
   );
 }
-#endif  // HAS_SETROW_NEON
-
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-#ifdef HAS_ARGBSETROWS_NEON
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
-                      int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    SetRow_NEON(dst, v32, width << 2);
-    dst += dst_stride;
-  }
-}
-#endif  // HAS_ARGBSETROWS_NEON
 
 #ifdef HAS_MIRRORROW_NEON
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
+    "b.gt       1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_MIRRORROW_NEON
@@ -949,57 +853,56 @@
 #ifdef HAS_MIRRORUVROW_NEON
 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                       int width) {
+  int64 width64 = (int64) width;
   asm volatile (
     // Start at end of source row.
-    "mov        r12, #-16                      \n"
     "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"
+    "rev64      v1.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%2], #8              \n"
+    "b.gt       1b                             \n"
   : "+r"(src_uv),  // %0
     "+r"(dst_u),   // %1
     "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
+    "+r"(width64)    // %3
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_MIRRORUVROW_NEON
 
 #ifdef HAS_ARGBMIRRORROW_NEON
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
+    "b.gt       1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +910,19 @@
 #ifdef HAS_RGB24TOARGBROW_NEON
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
+    "movi       v4.8b, #255                    \n"  // Alpha
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),   // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_RGB24TOARGBROW_NEON
@@ -1028,138 +930,147 @@
 #ifdef HAS_RAWTOARGBROW_NEON
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
+    "movi       v5.8b, #255                    \n"  // Alpha
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+    "b.gt       1b                             \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_RAWTOARGBROW_NEON
 
 #define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
+    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
+    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
+    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
+    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
+    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
+    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
+    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
 
 #ifdef HAS_RGB565TOARGBROW_NEON
 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
+    "movi       v3.8b, #255                    \n"  // Alpha
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     RGB565TOARGB
     MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_rgb565),  // %0
     "+r"(dst_argb),    // %1
     "+r"(pix)          // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
   );
 }
 #endif  // HAS_RGB565TOARGBROW_NEON
 
 #define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
+                                                                               \
+    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
+    "xtn2       v3.16b, v2.8h                  \n"                             \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
+    "dup        v1.2D, v0.D[1]                 \n"                             \
+    "dup        v3.2D, v2.D[1]                 \n"
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
 #define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
 
 #ifdef HAS_ARGB1555TOARGBROW_NEON
 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
+    "movi       v3.8b, #255                    \n"  // Alpha
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGB1555TOARGB
     MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_argb1555),  // %0
     "+r"(dst_argb),    // %1
     "+r"(pix)          // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_ARGB1555TOARGBROW_NEON
 
 #define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
+    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
+    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
+    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
+    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
+    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
+    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
+    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
+    "dup        v0.2D, v2.D[1]                 \n"                             \
+    "dup        v1.2D, v3.D[1]                 \n"
 
 #ifdef HAS_ARGB4444TOARGBROW_NEON
 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGB4444TOARGB
     MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_argb4444),  // %0
     "+r"(dst_argb),    // %1
     "+r"(pix)          // %2
   :
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_ARGB4444TOARGBROW_NEON
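For reference, the ARGB1555TOARGB and ARGB4444TOARGB macros above are bit-depth expanders: each narrow channel is widened to 8 bits by replicating its top bits into the freed low bits. A scalar sketch of that expansion (illustrative only; the helper names are not libyuv's, and the library's own C fallbacks live elsewhere):

#include <stdint.h>

// Illustrative per-pixel expansion, assuming little-endian 16-bit input and
// B,G,R,A output byte order as used by the NEON stores above.
static void ARGB1555PixelSketch(uint16_t v, uint8_t* dst) {
  uint8_t b = v & 0x1f;
  uint8_t g = (v >> 5) & 0x1f;
  uint8_t r = (v >> 10) & 0x1f;
  uint8_t a = (v >> 15) & 0x01;
  dst[0] = (uint8_t)((b << 3) | (b >> 2));  // 5 bits -> 8 bits
  dst[1] = (uint8_t)((g << 3) | (g >> 2));
  dst[2] = (uint8_t)((r << 3) | (r >> 2));
  dst[3] = (uint8_t)(-a);                   // 1-bit alpha -> 0 or 255
}

static void ARGB4444PixelSketch(uint16_t v, uint8_t* dst) {
  uint8_t b = v & 0x0f;
  uint8_t g = (v >> 4) & 0x0f;
  uint8_t r = (v >> 8) & 0x0f;
  uint8_t a = (v >> 12) & 0x0f;
  dst[0] = (uint8_t)((b << 4) | b);         // 4 bits -> 8 bits
  dst[1] = (uint8_t)((g << 4) | g);
  dst[2] = (uint8_t)((r << 4) | r);
  dst[3] = (uint8_t)((a << 4) | a);
}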
@@ -1167,19 +1078,18 @@
 #ifdef HAS_ARGBTORGB24ROW_NEON
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
+    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_rgb24),  // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORGB24ROW_NEON
@@ -1187,20 +1097,20 @@
 #ifdef HAS_ARGBTORAWROW_NEON
 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
+    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
-    "bgt        1b                             \n"
+    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_raw),   // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORAWROW_NEON
@@ -1208,19 +1118,18 @@
 #ifdef HAS_YUY2TOYROW_NEON
 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOYROW_NEON
@@ -1228,19 +1137,18 @@
 #ifdef HAS_UYVYTOYROW_NEON
 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOYROW_NEON
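YUY2 and UYVY differ only in where the luma bytes sit: YUY2 packs pixels as Y0 U Y1 V, UYVY as U Y0 V Y1, which is why YUY2ToYRow_NEON stores v0 from the ld2 de-interleave while UYVYToYRow_NEON stores v1. A scalar sketch of the same extraction (illustrative helpers, not the library's own C paths):

#include <stdint.h>

// Copy the luma plane out of packed 4:2:2 data; pix is the width in pixels.
static void YUY2ToYSketch(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[2 * i];      // Y lives in the even bytes
  }
}

static void UYVYToYSketch(const uint8_t* src_uyvy, uint8_t* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_uyvy[2 * i + 1];  // Y lives in the odd bytes
  }
}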
@@ -1249,22 +1157,21 @@
 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUV422ROW_NEON
@@ -1273,22 +1180,21 @@
 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUV422ROW_NEON
@@ -1296,29 +1202,29 @@
 #ifdef HAS_YUY2TOUVROW_NEON
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
   : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
+    "+r"(src_yuy2b),    // %1
     "+r"(dst_u),        // %2
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUVROW_NEON
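YUY2ToUVRow_NEON reads two rows and averages their chroma with urhadd (a rounding halving add), so each output U/V covers a 2x2 block of source pixels. A scalar sketch of that averaging, assuming the YUY2 byte order noted above (helper name is illustrative):

#include <stdint.h>

static void YUY2ToUVSketch(const uint8_t* src_yuy2, int stride_yuy2,
                           uint8_t* dst_u, uint8_t* dst_v, int pix) {
  const uint8_t* row1 = src_yuy2 + stride_yuy2;  // second row, as src_yuy2b
  for (int i = 0; i < pix; i += 2) {             // one U and one V per 2 pixels
    int o = 2 * i;                               // byte order: Y0 U Y1 V
    dst_u[i / 2] = (uint8_t)((src_yuy2[o + 1] + row1[o + 1] + 1) >> 1);
    dst_v[i / 2] = (uint8_t)((src_yuy2[o + 3] + row1[o + 3] + 1) >> 1);
  }
}

The second-row pointer mirrors the src_yuy2b variable the diff introduces in place of adding the stride inside the asm block.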
@@ -1326,126 +1232,53 @@
 #ifdef HAS_UYVYTOUVROW_NEON
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"
   : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
+    "+r"(src_uyvyb),    // %1
     "+r"(dst_u),        // %2
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUVROW_NEON
 
-#ifdef HAS_HALFROW_NEON
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
-    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),         // %0
-    "+r"(src_uv_stride),  // %1
-    "+r"(dst_uv),         // %2
-    "+r"(pix)             // %3
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-#endif  // HAS_HALFROW_NEON
-
-// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
-#ifdef HAS_ARGBTOBAYERROW_NEON
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                         uint32 selector, int pix) {
-  asm volatile (
-    "vmov.u32   d6[0], %3                      \n"  // selector
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
-    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
-    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_bayer),  // %1
-    "+r"(pix)         // %2
-  : "r"(selector)     // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTOBAYERROW_NEON
-
-// Select G channels from ARGB.  e.g.  GGGGGGGG
-#ifdef HAS_ARGBTOBAYERGGROW_NEON
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /*selector*/, int pix) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_bayer),  // %1
-    "+r"(pix)         // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-#endif  // HAS_ARGBTOBAYERGGROW_NEON
-
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   asm volatile (
     MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
-    "bgt        1b                             \n"
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_NEON
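The single tbl above replaces the pair of vtbl lookups: the 16-byte shuffler is an index table applied within each 16-byte (4-pixel) group, which is how BGRAToARGB, ABGRToARGB, RGBAToARGB and ARGBToRGBA can all share one kernel. A scalar sketch of the same table lookup (illustrative helper; the shuffler contents are whatever mask the caller passes):

#include <stdint.h>

static void ARGBShuffleSketch(const uint8_t* src, uint8_t* dst,
                              const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix; i += 4) {              // 4 pixels = 16 bytes per step
    for (int j = 0; j < 16; ++j) {
      dst[4 * i + j] = src[4 * i + shuffler[j]];  // shuffler[j] is in 0..15
    }
  }
}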
@@ -1456,25 +1289,25 @@
                         const uint8* src_v,
                         uint8* dst_yuy2, int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "orr        v2.8b, v1.8b, v1.8b            \n"
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
     "+r"(src_v),     // %2
     "+r"(dst_yuy2),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOYUY2ROW_NEON
@@ -1485,25 +1318,25 @@
                         const uint8* src_v,
                         uint8* dst_uyvy, int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+    "orr        v3.8b, v2.8b, v2.8b            \n"
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
     "+r"(src_v),     // %2
     "+r"(dst_uyvy),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOUYVYROW_NEON
@@ -1511,20 +1344,44 @@
 #ifdef HAS_ARGBTORGB565ROW_NEON
 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGBTORGB565
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_rgb565),  // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (
+    "dup        v1.4s, %w2                     \n"  // dither4
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v20.8b, v20.8b, v1.8b          \n"
+    "uqadd      v21.8b, v21.8b, v1.8b          \n"
+    "uqadd      v22.8b, v22.8b, v1.8b          \n"
+    ARGBTORGB565
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+  : "+r"(dst_rgb)    // %0
+  : "r"(src_argb),   // %1
+    "r"(dither4),    // %2
+    "r"(width)       // %3
+  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
   );
 }
 #endif  // HAS_ARGBTORGB565ROW_NEON
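The new ARGBToRGB565DitherRow_NEON broadcasts the four bytes of dither4 across the row (dup v1.4s, then the 8-byte uqadd), so pixel i gets dither byte i mod 4 saturating-added to B, G and R before the usual 5/6/5 truncation. A scalar sketch of one pixel (illustrative helper; the clamp is written out where the NEON uses uqadd):

#include <stdint.h>

static uint16_t ARGBToRGB565DitherSketch(const uint8_t* argb,
                                         uint32_t dither4, int i) {
  int d = (dither4 >> ((i & 3) * 8)) & 0xff;  // per-pixel dither byte
  int b = argb[0] + d; if (b > 255) b = 255;
  int g = argb[1] + d; if (g > 255) g = 255;
  int r = argb[2] + d; if (r > 255) r = 255;
  // Pack as RGB565: R in bits 15..11, G in 10..5, B in 4..0.
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}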
@@ -1533,20 +1390,19 @@
 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                             int pix) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGBTOARGB1555
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb1555),  // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
   );
 }
 #endif  // HAS_ARGBTOARGB1555ROW_NEON
@@ -1555,21 +1411,20 @@
 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                             int pix) {
   asm volatile (
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-    ".p2align   2                              \n"
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGBTOARGB4444
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),      // %0
     "+r"(dst_argb4444),  // %1
     "+r"(pix)            // %2
   :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
   );
 }
 #endif  // HAS_ARGBTOARGB4444ROW_NEON
@@ -1577,28 +1432,27 @@
 #ifdef HAS_ARGBTOYROW_NEON
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBTOYROW_NEON
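ARGBToYRow_NEON computes luma in 7-bit fixed point with the weights annotated above (13/128 for B, 65/128 for G, 33/128 for R): umull/umlal accumulate the weighted sum, sqrshrun #7 does the rounded shift back to 8 bits, and uqadd adds the +16 offset. The same arithmetic per pixel, as a sketch (helper name is illustrative):

#include <stdint.h>

static uint8_t ARGBPixelToYSketch(uint8_t b, uint8_t g, uint8_t r) {
  // (13*B + 65*G + 33*R + 64) >> 7 is at most 221, so the +16 cannot overflow.
  return (uint8_t)(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16);
}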
@@ -1606,26 +1460,25 @@
 #ifdef HAS_ARGBTOYJROW_NEON
 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-    ".p2align   2                              \n"
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   );
 }
 #endif  // HAS_ARGBTOYJROW_NEON
@@ -1635,41 +1488,41 @@
 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
+    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+    "movi       v29.16b,#0x80                  \n"  // 128.5
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlsl.u8   q2, d1, d25                    \n"  // G
-    "vmlsl.u8   q2, d2, d26                    \n"  // R
-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "vmull.u8   q3, d2, d24                    \n"  // R
-    "vmlsl.u8   q3, d1, d28                    \n"  // G
-    "vmlsl.u8   q3, d0, d27                    \n"  // B
-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
+    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
 
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v24", "v25", "v26", "v27", "v28", "v29"
   );
 }
 #endif  // HAS_ARGBTOUV444ROW_NEON
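The 4:4:4 chroma kernel above is the unsubsampled case: per pixel, U and V are weighted sums with the coefficients from the movi setup (112, 74, 38 for U; 112, 94, 18 for V), biased by 0x8080 and narrowed with uqshrn #8. As a scalar sketch (helper names illustrative; plain int arithmetic stands in for the NEON ops, which do not overflow for 8-bit inputs here):

#include <stdint.h>

static uint8_t ARGBPixelToUSketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static uint8_t ARGBPixelToVSketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}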
@@ -1679,49 +1532,41 @@
 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
 
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
 
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
 
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
 
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ARGBTOUV422ROW_NEON
@@ -1731,128 +1576,108 @@
 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
+    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
+    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
+    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
+    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
 
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ARGBTOUV411ROW_NEON
 
 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
+    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
+    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
+    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
+    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
+    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
+    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
+    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
 #ifdef HAS_ARGBTOUVROW_NEON
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
+    "+r"(src_argb_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ARGBTOUVROW_NEON
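The subsampled path differs from the 4:4:4 one only in the front end: uaddlp sums horizontal pairs, uadalp accumulates the second row (replacing the two separate vld4 loads and vpadal of the ARM32 version), and urshr #1 leaves twice the 2x2 average, which is why the RGBTOUV setup uses the halved coefficients (#112 / 2 and so on in the removed vmov lines). A scalar sketch of that box filter for one channel (helper name illustrative):

#include <stdint.h>

// Sum of a 2x2 block of one ARGB channel, rounded-halved as urshr #1 leaves it.
// r0 and r1 point at the channel byte of two horizontally adjacent pixels on
// consecutive rows; adjacent ARGB pixels are 4 bytes apart.
static uint16_t Box2x2HalfSketch(const uint8_t* r0, const uint8_t* r1) {
  return (uint16_t)((r0[0] + r0[4] + r1[0] + r1[4] + 1) >> 1);
}

The RGBTOUV macro then applies the halved weights to these values, so the result matches the 4:4:4 formula applied to the block average up to rounding.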
@@ -1861,50 +1686,45 @@
 #ifdef HAS_ARGBTOUVJROW_NEON
 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
-    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
-    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
-    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
-    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
+    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
+    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
+    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
+    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
+    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
+    "+r"(src_argb_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ARGBTOUVJROW_NEON
@@ -1912,50 +1732,40 @@
 #ifdef HAS_BGRATOUVROW_NEON
 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
-    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
-    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
+    "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
-    "vrshr.u16  q2, q2, #1                     \n"
-    "vrshr.u16  q3, q3, #1                     \n"
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v3.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q3, q2, q1)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_bgra),  // %0
-    "+r"(src_stride_bgra),  // %1
+    "+r"(src_bgra_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_BGRATOUVROW_NEON
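
Every function in this hunk is ported from 32-bit NEON to AArch64 with the same recipe: the second source row is precomputed in C (for example src_bgra_1 = src_bgra + src_stride_bgra) instead of with an "add" inside the asm block, q/d registers become v registers with explicit arrangements, "bgt" becomes "b.gt", the ".p2align" hint is dropped, and 32-bit int operands such as pix are referenced through the %w modifier so "subs" operates on the 32-bit view of the register. A minimal stand-alone sketch of that pattern follows; the helper name and its row-averaging body are illustrative only and are not part of this change. It requires an AArch64 toolchain.

#include <stdint.h>

// Hypothetical helper, for illustration only: averages two rows of a plane,
// 16 bytes at a time (width is assumed to be a multiple of 16, as in the
// row functions above).
static void AverageTwoRows_NEON(const uint8_t* src, int src_stride,
                                uint8_t* dst, int width) {
  const uint8_t* src1 = src + src_stride;  // replaces "add %1, %0, %1"
  asm volatile (
  "1:                                          \n"
    "ld1        {v0.16b}, [%0], #16            \n"  // 16 bytes from row 0.
    "ld1        {v1.16b}, [%1], #16            \n"  // 16 bytes from row 1.
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounded average.
    "subs       %w3, %w3, #16                  \n"  // %w: 32-bit counter.
    "st1        {v0.16b}, [%2], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src),   // %0
    "+r"(src1),  // %1
    "+r"(dst),   // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}
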
@@ -1963,50 +1773,40 @@
 #ifdef HAS_ABGRTOUVROW_NEON
 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+    "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"
+    "urshr      v1.8h, v1.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q2, q1, q0)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v2.8h, v1.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_abgr),  // %0
-    "+r"(src_stride_abgr),  // %1
+    "+r"(src_abgr_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ABGRTOUVROW_NEON
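
The per-channel math is shared by all of the ToUV rows: uaddlp sums horizontally adjacent pixels, uadalp adds the matching pair from the second row, urshr #1 halves with rounding (leaving a value that is twice the 2x2 average), and the RGBTOUV macro applies the halved coefficients loaded by RGBTOUV_SETUP_REG. The sketch below is a plain-C illustration of one 2x2 block, not code from this change, and assumes RGBTOUV expands to the mul/mls/add/uqshrn sequence spelled out in RGB565ToUVRow_NEON further down.

#include <stdint.h>

// b[], g[], r[] hold the four samples of one channel in a 2x2 block, after
// ld4/ld3 has de-interleaved the source bytes.
static void RGBToUV_2x2_sketch(const uint8_t b[4], const uint8_t g[4],
                               const uint8_t r[4], uint8_t* u, uint8_t* v) {
  // uaddlp + uadalp sum the 2x2 block exactly; urshr #1 halves with rounding,
  // leaving a value in 0..510 (twice the channel average).
  int b2 = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
  int g2 = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
  int r2 = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
  // Halved coefficients (112/2, 74/2, 38/2, 18/2, 94/2) compensate for the
  // doubled input; 0x8080 supplies the +128 chroma bias plus 0.5 rounding
  // before the final >>8 (uqshrn #8 would also saturate, but these values
  // already fit in 8 bits).
  int u16 = 56 * b2 - 37 * g2 - 19 * r2 + 0x8080;
  int v16 = 56 * r2 - 47 * g2 -  9 * b2 + 0x8080;
  *u = (uint8_t)(u16 >> 8);
  *v = (uint8_t)(v16 >> 8);
}
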
@@ -2014,50 +1814,40 @@
 #ifdef HAS_RGBATOUVROW_NEON
 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
-    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
-    MEMACCESS(1)
-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
-    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+    "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_rgba),  // %0
-    "+r"(src_stride_rgba),  // %1
+    "+r"(src_rgba_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_RGBATOUVROW_NEON
@@ -2065,50 +1855,40 @@
 #ifdef HAS_RGB24TOUVROW_NEON
 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                        uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
+    "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q0, q1, q2)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_rgb24),  // %0
-    "+r"(src_stride_rgb24),  // %1
+    "+r"(src_rgb24_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_RGB24TOUVROW_NEON
@@ -2116,50 +1896,40 @@
 #ifdef HAS_RAWTOUVROW_NEON
 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_raw
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
-    MEMACCESS(0)
-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
+    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
     MEMACCESS(1)
-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
-    MEMACCESS(1)
-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
+    "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
+    "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v0.8h, v0.8h, #1               \n"
 
-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV(q2, q1, q0)
+    "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
+    RGBTOUV(v2.8h, v1.8h, v0.8h)
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_raw),  // %0
-    "+r"(src_stride_raw),  // %1
+    "+r"(src_raw_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_RAWTOUVROW_NEON
@@ -2168,70 +1938,74 @@
 #ifdef HAS_RGB565TOUVROW_NEON
 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                         uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
+    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
     RGB565TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
     RGB565TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+    "ins        v16.D[1], v17.D[0]             \n"
+    "ins        v18.D[1], v19.D[0]             \n"
+    "ins        v20.D[1], v21.D[0]             \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v18.8h, #1              \n"
+    "urshr      v6.8h, v20.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
+    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
+    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
+    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
+    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
+    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
+    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_rgb565),  // %0
-    "+r"(src_stride_rgb565),  // %1
+    "+r"(src_rgb565_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+    "v25", "v26", "v27"
   );
 }
 #endif  // HAS_RGB565TOUVROW_NEON
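
The packed 16-bit formats (RGB565, ARGB1555, ARGB4444) cannot be de-interleaved with ld4, so each batch of 8 pixels is loaded as raw bytes, widened to 8-bit channels by the RGB565TOARGB / RGB555TOARGB / ARGB4444TOARGB macros defined earlier in this file, pairwise-summed into 4-lane halves, and the halves are stitched into full 8-lane vectors with "ins vN.D[1], vM.D[0]" before the usual averaging and U/V math. The widening step is conventionally done by bit replication; the sketch below assumes that and a little-endian RGB565 layout with blue in the low 5 bits, but the authoritative behavior is whatever the macros actually implement.

#include <stdint.h>

// Bit-replication expansion: copy the top bits of the field into the vacated
// low bits so 0 maps to 0 and the field maximum maps to 255.
static inline uint8_t Expand5(uint8_t v5) { return (uint8_t)((v5 << 3) | (v5 >> 2)); }
static inline uint8_t Expand6(uint8_t v6) { return (uint8_t)((v6 << 2) | (v6 >> 4)); }

static void RGB565ToBGR_sketch(uint16_t pix, uint8_t* b, uint8_t* g, uint8_t* r) {
  *b = Expand5((uint8_t)(pix & 0x1f));         // assumed: blue in bits 0..4
  *g = Expand6((uint8_t)((pix >> 5) & 0x3f));  // assumed: green in bits 5..10
  *r = Expand5((uint8_t)(pix >> 11));          // assumed: red in bits 11..15
}
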
@@ -2240,70 +2014,69 @@
 #ifdef HAS_ARGB1555TOUVROW_NEON
 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                         uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
     RGB555TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
     RGB555TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+    "ins        v16.D[1], v26.D[0]             \n"
+    "ins        v17.D[1], v27.D[0]             \n"
+    "ins        v18.D[1], v28.D[0]             \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb1555),  // %0
-    "+r"(src_stride_argb1555),  // %1
+    "+r"(src_argb1555_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
   );
 }
 #endif  // HAS_ARGB1555TOUVROW_NEON
@@ -2312,70 +2085,70 @@
 #ifdef HAS_ARGB4444TOUVROW_NEON
 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                           uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
   asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-    ".p2align   2                              \n"
+    RGBTOUV_SETUP_REG
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
     ARGB4444TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
     ARGB4444TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+    "ins        v16.D[1], v26.D[0]             \n"
+    "ins        v17.D[1], v27.D[0]             \n"
+    "ins        v18.D[1], v28.D[0]             \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
     MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"
   : "+r"(src_argb4444),  // %0
-    "+r"(src_stride_argb4444),  // %1
+    "+r"(src_argb4444_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(pix)        // %4
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
   );
 }
 #endif  // HAS_ARGB4444TOUVROW_NEON
@@ -2383,29 +2156,29 @@
 #ifdef HAS_RGB565TOYROW_NEON
 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     RGB565TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_rgb565),  // %0
     "+r"(dst_y),       // %1
     "+r"(pix)          // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+    "v24", "v25", "v26", "v27"
   );
 }
 #endif  // HAS_RGB565TOYROW_NEON
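
RGB565ToYRow_NEON above, and the ARGB1555/ARGB4444/BGRA/ABGR/RGBA/RGB24/RAW ToY variants below, all use the same luma constants; only the channel order of the multiplies changes. The umull/umlal pair accumulates 13*B + 65*G + 33*R in 16 bits, sqrshrun #7 performs a rounding shift right by 7 with saturation to 8 bits, and uqadd adds the "Add 16 constant" with saturation. A plain-C sketch of one pixel, for illustration only:

#include <stdint.h>

static uint8_t RGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounding shift; max is 221
  y += 16;                                       // studio-swing offset
  return (uint8_t)(y > 255 ? 255 : y);           // mirror the saturating ops
}
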
@@ -2413,29 +2186,28 @@
 #ifdef HAS_ARGB1555TOYROW_NEON
 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGB1555TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_argb1555),  // %0
     "+r"(dst_y),         // %1
     "+r"(pix)            // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGB1555TOYROW_NEON
@@ -2443,29 +2215,28 @@
 #ifdef HAS_ARGB4444TOYROW_NEON
 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
     ARGB4444TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_argb4444),  // %0
     "+r"(dst_y),         // %1
     "+r"(pix)            // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
   );
 }
 #endif  // HAS_ARGB4444TOYROW_NEON
@@ -2473,28 +2244,27 @@
 #ifdef HAS_BGRATOYROW_NEON
 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // R
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 #endif  // HAS_BGRATOYROW_NEON
@@ -2502,28 +2272,27 @@
 #ifdef HAS_ABGRTOYROW_NEON
 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // R
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_abgr),  // %0
-    "+r"(dst_y),  // %1
+    "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 #endif  // HAS_ABGRTOYROW_NEON
@@ -2531,28 +2300,27 @@
 #ifdef HAS_RGBATOYROW_NEON
 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // B
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_rgba),  // %0
-    "+r"(dst_y),  // %1
+    "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 #endif  // HAS_RGBATOYROW_NEON
@@ -2560,28 +2328,27 @@
 #ifdef HAS_RGB24TOYROW_NEON
 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
+    "+r"(dst_y),      // %1
+    "+r"(pix)         // %2
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 #endif  // HAS_RGB24TOYROW_NEON
@@ -2589,28 +2356,27 @@
 #ifdef HAS_RAWTOYROW_NEON
 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-    ".p2align   2                              \n"
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"
   : "+r"(src_raw),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(pix)        // %2
+    "+r"(dst_y),    // %1
+    "+r"(pix)       // %2
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 #endif  // HAS_RAWTOYROW_NEON
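
InterpolateRow_NEON below blends two rows with an 8-bit fraction. The AArch64 rewrite hoists y0_fraction, y1_fraction, and the second-row pointer into C, keeps the special cases for fractions 0, 64, 128, and 192, and handles every other fraction with the umull/umlal/rshrn path. A scalar sketch of that general case, for illustration only:

#include <stdint.h>
#include <stddef.h>

// Each output byte is a weighted average of the same column in two rows,
// with source_y_fraction/256 weight on the second row and round-to-nearest
// (the rshrn #8 step in the asm).
static void InterpolateRow_sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  const uint8_t* src1 = src + src_stride;
  int y1 = source_y_fraction;   // weight of the second row, 0..256
  int y0 = 256 - y1;            // weight of the first row
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * y0 + src1[x] * y1 + 128) >> 8);
  }
}

The 64/128/192 fractions avoid the multiplies entirely: a single urhadd gives the exact 50/50 result, and applying it twice approximates the 25/75 and 75/25 blends, which is why those fractions branch to their own loops.
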
@@ -2620,96 +2386,98 @@
 void InterpolateRow_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
   asm volatile (
-    "cmp        %4, #0                         \n"
-    "beq        100f                           \n"
-    "add        %2, %1                         \n"
-    "cmp        %4, #64                        \n"
-    "beq        75f                            \n"
-    "cmp        %4, #128                       \n"
-    "beq        50f                            \n"
-    "cmp        %4, #192                       \n"
-    "beq        25f                            \n"
+    "cmp        %w4, #0                        \n"
+    "b.eq       100f                           \n"
+    "cmp        %w4, #64                       \n"
+    "b.eq       75f                            \n"
+    "cmp        %w4, #128                      \n"
+    "b.eq       50f                            \n"
+    "cmp        %w4, #192                      \n"
+    "b.eq       25f                            \n"
 
-    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
-    "vdup.8     d4, %4                         \n"
+    "dup        v5.16b, %w4                    \n"
+    "dup        v4.16b, %w5                    \n"
     // General purpose row blend.
   "1:                                          \n"
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
+    "ld1        {v0.16b}, [%1], #16            \n"
     MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vmull.u8   q13, d0, d4                    \n"
-    "vmull.u8   q14, d1, d4                    \n"
-    "vmlal.u8   q13, d2, d5                    \n"
-    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "umull      v2.8h, v0.8b,  v4.8b           \n"
+    "umull2     v3.8h, v0.16b, v4.16b          \n"
+    "umlal      v2.8h, v1.8b,  v5.8b           \n"
+    "umlal2     v3.8h, v1.16b, v5.16b          \n"
+    "rshrn      v0.8b,  v2.8h, #8              \n"
+    "rshrn2     v0.16b, v3.8h, #8              \n"
     MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       1b                             \n"
     "b          99f                            \n"
 
     // Blend 25 / 75.
   "25:                                         \n"
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
+    "ld1        {v0.16b}, [%1], #16            \n"
     MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    "vrhadd.u8  q0, q1                         \n"
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
     MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        25b                            \n"
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       25b                            \n"
     "b          99f                            \n"
 
     // Blend 50 / 50.
   "50:                                         \n"
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
+    "ld1        {v0.16b}, [%1], #16            \n"
     MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
     MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        50b                            \n"
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       50b                            \n"
     "b          99f                            \n"
 
     // Blend 75 / 25.
   "75:                                         \n"
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
+    "ld1        {v1.16b}, [%1], #16            \n"
     MEMACCESS(2)
-    "vld1.8     {q0}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    "vrhadd.u8  q0, q1                         \n"
+    "ld1        {v0.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"
     MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        75b                            \n"
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       75b                            \n"
     "b          99f                            \n"
 
     // Blend 100 / 0 - Copy row unchanged.
   "100:                                        \n"
     MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    "subs       %3, %3, #16                    \n"
+    "ld1        {v0.16b}, [%1], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
     MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        100b                           \n"
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       100b                           \n"
 
   "99:                                         \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
+    "+r"(src_ptr1),         // %2
     "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
+    "+r"(y1_fraction),      // %4
+    "+r"(y0_fraction)       // %5
   :
-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
   );
 }
 #endif  // HAS_INTERPOLATEROW_NEON
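InterpolateRow_NEON blends each byte of two source rows into one destination row. A minimal scalar sketch of the general path above; the 64/128/192 branches replace it with one or two urhadd rounding averages (exact for 50/50, a close approximation for 25/75 and 75/25).

  #include <stdint.h>

  // Scalar sketch of the general blend path in InterpolateRow_NEON above:
  // dst = (src * (256 - f) + src1 * f + 128) >> 8, where f = source_y_fraction
  // and src1 = src + src_stride.
  static void InterpolateRow_C(uint8_t* dst, const uint8_t* src,
                               const uint8_t* src1, int width,
                               int source_y_fraction) {
    int y1 = source_y_fraction;
    int y0 = 256 - y1;
    for (int i = 0; i < width; ++i) {
      dst[i] = (uint8_t)((src[i] * y0 + src1[i] * y1 + 128) >> 8);  // umull/umlal + rshrn
    }
  }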
@@ -2719,55 +2487,59 @@
 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
   asm volatile (
-    "subs       %3, #8                         \n"
-    "blt        89f                            \n"
+    "subs       %w3, %w3, #8                   \n"
+    "b.lt       89f                            \n"
     // Blend 8 pixels.
   "8:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
-    "bge        8b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.ge       8b                             \n"
 
   "89:                                         \n"
-    "adds       %3, #8-1                       \n"
-    "blt        99f                            \n"
+    "adds       %w3, %w3, #8-1                 \n"
+    "b.lt       99f                            \n"
 
     // Blend 1 pixels.
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
     MEMACCESS(1)
-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
+    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
     MEMACCESS(2)
-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
-    "bge        1b                             \n"
+    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+    "b.ge       1b                             \n"
 
   "99:                                         \n"
 
@@ -2776,7 +2548,8 @@
     "+r"(dst_argb),     // %2
     "+r"(width)         // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18"
   );
 }
 #endif  // HAS_ARGBBLENDROW_NEON
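ARGBBlendRow_NEON composites source over destination using the source alpha and forces the result alpha to 255; src_argb0 supplies the alpha, src_argb1 is the background. A per-pixel scalar sketch:

  #include <stdint.h>

  static uint8_t Clamp255(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

  // Scalar sketch of one blended pixel: out = src + (dst - dst * a / 256),
  // channel by channel, with saturating arithmetic (uqsub/uqadd above).
  static void BlendPixel_C(const uint8_t src[4],   // src_argb0: foreground
                           const uint8_t dst[4],   // src_argb1: background
                           uint8_t out[4]) {
    int a = src[3];
    for (int c = 0; c < 3; ++c) {
      int d_scaled = (dst[c] * a + 128) >> 8;      // umull + uqrshrn #8
      out[c] = Clamp255(src[c] + (dst[c] - d_scaled));
    }
    out[3] = 255;                                  // movi v3.8b, #255
  }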
@@ -2788,22 +2561,22 @@
     // Attenuate 8 pixels.
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d0, d3                    \n"  // b * a
-    "vmull.u8   q11, d1, d3                    \n"  // g * a
-    "vmull.u8   q12, d2, d3                    \n"  // r * a
-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
+    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
     MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(width)       // %2
   :
-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   );
 }
 #endif  // HAS_ARGBATTENUATEROW_NEON
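ARGBAttenuateRow_NEON premultiplies B, G and R by the pixel's own alpha and leaves alpha unchanged. Scalar sketch:

  #include <stdint.h>

  // Scalar sketch of ARGBAttenuateRow: c = (c * a + 128) >> 8 for B, G and R.
  static void AttenuatePixel_C(const uint8_t argb[4], uint8_t out[4]) {
    int a = argb[3];
    for (int c = 0; c < 3; ++c) {
      out[c] = (uint8_t)((argb[c] * a + 128) >> 8);  // umull + uqrshrn #8
    }
    out[3] = (uint8_t)a;                             // alpha passes through
  }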
@@ -2814,41 +2587,40 @@
 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   asm volatile (
-    "vdup.u16   q8, %2                         \n"
-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
-    "vdup.u16   q9, %3                         \n"  // interval multiply.
-    "vdup.u16   q10, %4                        \n"  // interval add
+    "dup        v4.8h, %w2                     \n"
+    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+    "dup        v5.8h, %w3                     \n"  // interval multiply.
+    "dup        v6.8h, %w4                     \n"  // interval add
 
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
-    "vmovl.u8   q1, d2                         \n"
-    "vmovl.u8   q2, d4                         \n"
-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
-    "vmul.u16   q1, q1, q9                     \n"  // g
-    "vmul.u16   q2, q2, q9                     \n"  // r
-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
-    "vadd.u16   q1, q1, q10                    \n"  // g
-    "vadd.u16   q2, q2, q10                    \n"  // r
-    "vqmovn.u16 d0, q0                         \n"
-    "vqmovn.u16 d2, q1                         \n"
-    "vqmovn.u16 d4, q2                         \n"
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
+    "uxtl       v1.8h, v1.8b                   \n"
+    "uxtl       v2.8h, v2.8b                   \n"
+    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+    "add        v1.8h, v1.8h, v6.8h            \n"  // g
+    "add        v2.8h, v2.8h, v6.8h            \n"  // r
+    "uqxtn      v0.8b, v0.8h                   \n"
+    "uqxtn      v1.8b, v1.8h                   \n"
+    "uqxtn      v2.8b, v2.8h                   \n"
     MEMACCESS(0)
-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(dst_argb),       // %0
     "+r"(width)           // %1
   : "r"(scale),           // %2
     "r"(interval_size),   // %3
     "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   );
 }
 #endif  // HAS_ARGBQUANTIZEROW_NEON
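ARGBQuantizeRow_NEON posterizes B, G and R in place; sqdmulh against scale >> 1 is a fixed-point (x * scale) >> 16. A scalar sketch using the caller's scale, interval_size and interval_offset parameters:

  #include <stdint.h>

  // Scalar sketch of ARGBQuantizeRow: each of B, G, R becomes
  // (x * scale >> 16) * interval_size + interval_offset; alpha is untouched.
  static void QuantizePixel_C(uint8_t argb[4], int scale, int interval_size,
                              int interval_offset) {
    for (int c = 0; c < 3; ++c) {
      int v = (argb[c] * scale) >> 16;             // sqdmulh with (scale >> 1)
      v = v * interval_size + interval_offset;
      argb[c] = (uint8_t)(v > 255 ? 255 : v);      // uqxtn saturates
    }
  }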
@@ -2860,36 +2632,35 @@
 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
 
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
-    "vmovl.u8   q11, d22                       \n"
-    "vmovl.u8   q12, d24                       \n"
-    "vmovl.u8   q13, d26                       \n"
-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
-    "vqmovn.u16 d20, q10                       \n"
-    "vqmovn.u16 d22, q11                       \n"
-    "vqmovn.u16 d24, q12                       \n"
-    "vqmovn.u16 d26, q13                       \n"
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+    "uxtl       v5.8h, v5.8b                   \n"
+    "uxtl       v6.8h, v6.8b                   \n"
+    "uxtl       v7.8h, v7.8b                   \n"
+    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+    "uqxtn      v4.8b, v4.8h                   \n"
+    "uqxtn      v5.8b, v5.8h                   \n"
+    "uqxtn      v6.8b, v6.8h                   \n"
+    "uqxtn      v7.8b, v7.8h                   \n"
     MEMACCESS(1)
-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
+    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_argb),       // %0
     "+r"(dst_argb),       // %1
     "+r"(width)           // %2
   : "r"(value)            // %3
-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBSHADEROW_NEON
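ARGBShadeRow_NEON scales every channel, alpha included, by the matching byte of the packed ARGB value; the zip1/ushr/sqrdmulh sequence is a fixed-point product close to x * v / 255. A rough scalar sketch:

  #include <stdint.h>

  // Scalar approximation of ARGBShadeRow: each channel is scaled by the
  // corresponding byte of 'value' (the NEON form differs by at most a
  // rounding step).
  static void ShadePixel_C(const uint8_t argb[4], uint8_t out[4], uint32_t value) {
    for (int c = 0; c < 4; ++c) {
      int v = (value >> (8 * c)) & 0xff;           // b, g, r, a bytes of 'value'
      out[c] = (uint8_t)((argb[c] * v + 127) / 255);
    }
  }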
@@ -2900,28 +2671,27 @@
 #ifdef HAS_ARGBGRAYROW_NEON
 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-    ".p2align   2                              \n"
+    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
-    "vmov       d1, d0                         \n"  // G
-    "vmov       d2, d0                         \n"  // R
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
     MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(width)      // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
   );
 }
 #endif  // HAS_ARGBGRAYROW_NEON
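ARGBGrayRow_NEON computes a 7-bit fixed-point luma from the coefficients loaded above and writes it to B, G and R; alpha is preserved. Scalar sketch:

  #include <stdint.h>

  // Scalar sketch of ARGBGrayRow: y = (15*B + 75*G + 38*R + 64) >> 7.
  static void GrayPixel_C(const uint8_t argb[4], uint8_t out[4]) {
    int y = (argb[0] * 15 + argb[1] * 75 + argb[2] * 38 + 64) >> 7;  // sqrshrun #7
    out[0] = out[1] = out[2] = (uint8_t)y;   // orr duplicates the value to G and R
    out[3] = argb[3];
  }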
@@ -2934,40 +2704,39 @@
 #ifdef HAS_ARGBSEPIAROW_NEON
 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d20, #17                       \n"  // BB coefficient
-    "vmov.u8    d21, #68                       \n"  // BG coefficient
-    "vmov.u8    d22, #35                       \n"  // BR coefficient
-    "vmov.u8    d24, #22                       \n"  // GB coefficient
-    "vmov.u8    d25, #88                       \n"  // GG coefficient
-    "vmov.u8    d26, #45                       \n"  // GR coefficient
-    "vmov.u8    d28, #24                       \n"  // BB coefficient
-    "vmov.u8    d29, #98                       \n"  // BG coefficient
-    "vmov.u8    d30, #50                       \n"  // BR coefficient
-    ".p2align   2                              \n"
+    "movi       v20.8b, #17                    \n"  // BB coefficient
+    "movi       v21.8b, #68                    \n"  // BG coefficient
+    "movi       v22.8b, #35                    \n"  // BR coefficient
+    "movi       v24.8b, #22                    \n"  // GB coefficient
+    "movi       v25.8b, #88                    \n"  // GG coefficient
+    "movi       v26.8b, #45                    \n"  // GR coefficient
+    "movi       v28.8b, #24                    \n"  // RB coefficient
+    "movi       v29.8b, #98                    \n"  // RG coefficient
+    "movi       v30.8b, #50                    \n"  // RR coefficient
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
-    "vmlal.u8   q2, d1, d21                    \n"  // G
-    "vmlal.u8   q2, d2, d22                    \n"  // R
-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
-    "vmlal.u8   q3, d1, d25                    \n"  // G
-    "vmlal.u8   q3, d2, d26                    \n"  // R
-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
-    "vmlal.u8   q8, d1, d29                    \n"  // G
-    "vmlal.u8   q8, d2, d30                    \n"  // R
-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
+    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
+    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
+    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
+    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
+    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
+    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
+    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
+    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
+    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
     MEMACCESS(0)
-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
   : "+r"(dst_argb),  // %0
     "+r"(width)      // %1
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
   );
 }
 #endif  // HAS_ARGBSEPIAROW_NEON
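ARGBSepiaRow_NEON applies a fixed 3x3 matrix in 1/128 units, in place; uqshrn truncates and saturates, so bright pixels clamp at 255. Scalar sketch:

  #include <stdint.h>

  static uint8_t Sat255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

  // Scalar sketch of ARGBSepiaRow using the coefficient registers above
  // (v20-v22, v24-v26, v28-v30); alpha is untouched.
  static void SepiaPixel_C(uint8_t argb[4]) {
    int b = argb[0], g = argb[1], r = argb[2];
    argb[0] = Sat255((b * 17 + g * 68 + r * 35) >> 7);  // sepia B
    argb[1] = Sat255((b * 22 + g * 88 + r * 45) >> 7);  // sepia G
    argb[2] = Sat255((b * 24 + g * 98 + r * 50) >> 7);  // sepia R
  }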
@@ -2980,60 +2749,59 @@
                              const int8* matrix_argb, int width) {
   asm volatile (
     MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
+    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
 
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
-    "vmovl.u8   q9, d18                        \n"  // g
-    "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q15, d22                       \n"  // a
-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+    "uxtl       v17.8h, v17.8b                 \n"  // g
+    "uxtl       v18.8h, v18.8b                 \n"  // r
+    "uxtl       v19.8h, v19.8b                 \n"  // a
+    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
     MEMACCESS(1)
-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(width)       // %2
   : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v22", "v23", "v24", "v25"
   );
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_NEON
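ARGBColorMatrixRow_NEON applies a caller-supplied 4x4 matrix of signed coefficients in 1/64 units; the lane indexing above implies matrix_argb is four rows of four bytes producing B, G, R and A in that order. A scalar sketch (the NEON version also saturates the intermediate sums at 16 bits, which this omits):

  #include <stdint.h>

  static uint8_t SatU8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

  // Scalar sketch of ARGBColorMatrixRow.
  static void ColorMatrixPixel_C(const uint8_t in[4], uint8_t out[4],
                                 const int8_t matrix_argb[16]) {
    for (int row = 0; row < 4; ++row) {              // rows produce B, G, R, A
      int acc = in[0] * matrix_argb[row * 4 + 0] +   // B term
                in[1] * matrix_argb[row * 4 + 1] +   // G term
                in[2] * matrix_argb[row * 4 + 2] +   // R term
                in[3] * matrix_argb[row * 4 + 3];    // A term
      out[row] = SatU8(acc >> 6);                    // sqshrun #6
    }
  }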
@@ -3045,31 +2813,30 @@
                           uint8* dst_argb, int width) {
   asm volatile (
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
 
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBMULTIPLYROW_NEON
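ARGBMultiplyRow_NEON multiplies the two images channel by channel and renormalizes by 256 with rounding. Scalar sketch:

  #include <stdint.h>

  // Scalar sketch of ARGBMultiplyRow: out = (a * b + 128) >> 8 per channel.
  static void MultiplyPixel_C(const uint8_t a[4], const uint8_t b[4],
                              uint8_t out[4]) {
    for (int c = 0; c < 4; ++c) {
      out[c] = (uint8_t)((a[c] * b[c] + 128) >> 8);  // umull + rshrn #8
    }
  }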
@@ -3080,25 +2847,26 @@
                      uint8* dst_argb, int width) {
   asm volatile (
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
 
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBADDROW_NEON
@@ -3109,25 +2877,26 @@
                           uint8* dst_argb, int width) {
   asm volatile (
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
 
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +2910,26 @@
 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+    "orr        v1.8b, v0.8b, v0.8b            \n"
+    "orr        v2.8b, v0.8b, v0.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELROW_NEON
@@ -3172,23 +2940,22 @@
                           uint8* dst_y, int width) {
   asm volatile (
     // 16 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
-    "bgt        1b                             \n"
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+    "b.gt       1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_y),       // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +2969,24 @@
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELXYROW_NEON
@@ -3233,40 +2999,39 @@
 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    "ld1        {v0.8b}, [%0],%5               \n"  // top
     MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%0],%6               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%6               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
-    "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "ld1        {v3.8b}, [%2],%6               \n"
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+    "b.gt       1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
     "+r"(src_y2),      // %2
     "+r"(dst_sobelx),  // %3
     "+r"(width)        // %4
-  : "r"(2),            // %5
-    "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "r"(2LL),          // %5
+    "r"(6LL)           // %6
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELXROW_NEON
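SobelXRow_NEON reads three source rows and, per output pixel, sums column differences two pixels apart with double weight on the centre row; the %5/%6 post-increments of 2 and 6 advance each pointer by 8 per iteration. Scalar sketch (SobelYRow_NEON below applies the same pattern along the other axis):

  #include <stdint.h>

  // Scalar sketch of SobelXRow: horizontal gradient of a luma plane.
  static void SobelXRow_C(const uint8_t* y0, const uint8_t* y1,
                          const uint8_t* y2, uint8_t* dst_sobelx, int width) {
    for (int i = 0; i < width; ++i) {
      int sobel = (y0[i] - y0[i + 2]) +
                  2 * (y1[i] - y1[i + 2]) +          // centre row counted twice
                  (y2[i] - y2[i + 2]);
      if (sobel < 0) sobel = -sobel;                 // abs
      dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // uqxtn saturates
    }
  }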
@@ -3279,39 +3044,38 @@
 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
   asm volatile (
-    ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    "ld1        {v0.8b}, [%0],%4               \n"  // left
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%1],%4               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%4               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    "ld1        {v2.8b}, [%0],%5               \n"  // right
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
-    "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "ld1        {v3.8b}, [%1],%5               \n"
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
-    "bgt        1b                             \n"
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+    "b.gt       1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
     "+r"(dst_sobely),  // %2
     "+r"(width)        // %3
-  : "r"(1),            // %4
-    "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "r"(1LL),          // %4
+    "r"(6LL)           // %5
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELYROW_NEON
diff --git a/libvpx/third_party/libyuv/source/row_win.cc b/libvpx/third_party/libyuv/source/row_win.cc
index 8eb8889..71be268 100644
--- a/libvpx/third_party/libyuv/source/row_win.cc
+++ b/libvpx/third_party/libyuv/source/row_win.cc
@@ -10,7 +10,8 @@
 
 #include "libyuv/row.h"
 
-#if defined (_M_X64)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+    defined(_MSC_VER) && !defined(__clang__)
 #include <emmintrin.h>
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
 #endif
@@ -21,67 +22,139 @@
 #endif
 
 // This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__)
 
-#define YG 74  /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127  /* min(127,(int8)(2.018 * 64)) */
-#define UG -25  /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52  /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102  /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static const vec8 kUVToB = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+struct YuvConstants {
+  lvec8 kUVToB;     // 0
+  lvec8 kUVToG;     // 32
+  lvec8 kUVToR;     // 64
+  lvec16 kUVBiasB;  // 96
+  lvec16 kUVBiasG;  // 128
+  lvec16 kUVBiasR;  // 160
+  lvec16 kYToRgb;   // 192
 };
 
-static const vec8 kUVToR = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
 };
 
-static const vec8 kUVToG = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
 };
 
-static const vec8 kVUToB = {
-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
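For reference, a scalar sketch of the fixed-point scheme the BT.601 constants above encode, shown for the blue channel of one pixel. It mirrors the SIMD flow used later in this file: Y is replicated into both bytes of a 16-bit lane (y * 257), multiplied high (upper 16 bits of the product) against YG, and the biased UV term is added before a 6-bit shift. The constant values are repeated locally because the macros are #undef'd above.

  #include <stdint.h>

  // Blue channel of one BT.601 pixel using the fixed-point constants above.
  static uint8_t YuvPixelB(uint8_t y, uint8_t u) {
    const int UB = -128, YG = 18997, YGB = -1160;
    const int BB = UB * 128 + YGB;                          // folds in the -16 bias
    int y1 = (int)(((uint32_t)(y * 0x0101) * YG) >> 16);    // ~ 1.164 * 64 * Y
    int b = (BB - u * UB + y1) >> 6;                        // ~ 1.164*(Y-16) + 2*(U-128)
    return (uint8_t)(b < 0 ? 0 : (b > 255 ? 255 : b));
  }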
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414  * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128             + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ             (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
 };
 
-static const vec8 kVUToR = {
-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
 
 // 64 bit
 #if defined(_M_X64)
-
-// Aligned destination version.
-__declspec(align(16))
+#if defined(HAS_I422TOARGBROW_SSSE3)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
-
   __m128i xmm0, xmm1, xmm2, xmm3;
   const __m128i xmm5 = _mm_set1_epi8(-1);
-  const __m128i xmm4 = _mm_setzero_si128();
   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
 
   while (width > 0) {
@@ -89,18 +162,17 @@
     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
-    xmm1 = _mm_load_si128(&xmm0);
-    xmm2 = _mm_load_si128(&xmm0);
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
-    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
-    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
-    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+    xmm1 = _mm_loadu_si128(&xmm0);
+    xmm2 = _mm_loadu_si128(&xmm0);
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
     xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
-    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
-    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
-    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
+    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
     xmm0 = _mm_adds_epi16(xmm0, xmm3);
     xmm1 = _mm_adds_epi16(xmm1, xmm3);
     xmm2 = _mm_adds_epi16(xmm2, xmm3);
@@ -112,61 +184,7 @@
     xmm2 = _mm_packus_epi16(xmm2, xmm2);
     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
-    xmm1 = _mm_load_si128(&xmm0);
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
-    _mm_store_si128((__m128i *)dst_argb, xmm0);
-    _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
-
-    y_buf += 8;
-    u_buf += 4;
-    dst_argb += 32;
-    width -= 8;
-  }
-}
-
-// Unaligned destination version.
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-
-  __m128i xmm0, xmm1, xmm2, xmm3;
-  const __m128i xmm5 = _mm_set1_epi8(-1);
-  const __m128i xmm4 = _mm_setzero_si128();
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
-  while (width > 0) {
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
-    xmm1 = _mm_load_si128(&xmm0);
-    xmm2 = _mm_load_si128(&xmm0);
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
-    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
-    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
-    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
-    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
-    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
-    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
-    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
-    xmm0 = _mm_adds_epi16(xmm0, xmm3);
-    xmm1 = _mm_adds_epi16(xmm1, xmm3);
-    xmm2 = _mm_adds_epi16(xmm2, xmm3);
-    xmm0 = _mm_srai_epi16(xmm0, 6);
-    xmm1 = _mm_srai_epi16(xmm1, 6);
-    xmm2 = _mm_srai_epi16(xmm2, 6);
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
-    xmm1 = _mm_load_si128(&xmm0);
+    xmm1 = _mm_loadu_si128(&xmm0);
     xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
     xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
 
@@ -179,9 +197,9 @@
     width -= 8;
   }
 }
+#endif
 // 32 bit
 #else  // defined(_M_X64)
-
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
@@ -210,15 +228,10 @@
   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
 };
 
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
-
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
 };
 
 // Constants for BGRA.
@@ -264,6 +277,7 @@
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
+// 7 bit fixed point 0.5.
 static const vec16 kAddYJ64 = {
   64, 64, 64, 64, 64, 64, 64, 64
 };
@@ -308,8 +322,8 @@
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
     mov        eax, [esp + 4]        // src_y
     mov        edx, [esp + 8]        // dst_argb
@@ -317,36 +331,6 @@
     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
     pslld      xmm5, 24
 
-    align      4
-  convertloop:
-    movq       xmm0, qword ptr [eax]
-    lea        eax,  [eax + 8]
-    punpcklbw  xmm0, xmm0
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0
-    punpckhwd  xmm1, xmm1
-    por        xmm0, xmm5
-    por        xmm1, xmm5
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
-                                  int pix) {
-  __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
-    pslld      xmm5, 24
-
-    align      4
   convertloop:
     movq       xmm0, qword ptr [eax]
     lea        eax,  [eax + 8]
@@ -365,7 +349,39 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov         eax, [esp + 4]        // src_y
+    mov         edx, [esp + 8]        // dst_argb
+    mov         ecx, [esp + 12]       // pix
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    vpslld      ymm5, ymm5, 24
+
+  convertloop:
+    vmovdqu     xmm0, [eax]
+    lea         eax,  [eax + 16]
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpcklbw  ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhwd  ymm1, ymm0, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm0
+    vpor        ymm0, ymm0, ymm5
+    vpor        ymm1, ymm1, ymm5
+    vmovdqu     [edx], ymm0
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_J400TOARGBROW_AVX2
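
Note: the J400 rows above simply replicate the gray byte into B, G and R and force alpha opaque, as their comment says. A minimal scalar sketch of the same expansion (the helper name is illustrative, not a library function):

#include <stdint.h>

// Scalar sketch: each gray byte g becomes the 4-byte pixel B=G=R=g, A=0xff.
static void J400ToARGBRow_C_sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t g = src_y[x];
    dst_argb[4 * x + 0] = g;     // B
    dst_argb[4 * x + 1] = g;     // G
    dst_argb[4 * x + 2] = g;     // R
    dst_argb[4 * x + 3] = 0xff;  // A (opaque)
  }
}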
+
+__declspec(naked)
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_rgb24
@@ -375,7 +391,6 @@
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskRGB24ToARGB
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + 16]
@@ -387,24 +402,24 @@
     por       xmm2, xmm5
     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
-    movdqa    [edx + 32], xmm2
+    movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     por       xmm1, xmm5
     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
-    movdqa    [edx + 16], xmm1
+    movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
-    sub       ecx, 16
-    movdqa    [edx + 48], xmm3
+    movdqu    [edx + 48], xmm3
     lea       edx, [edx + 64]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                         int pix) {
   __asm {
@@ -415,7 +430,6 @@
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskRAWToARGB
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + 16]
@@ -427,18 +441,18 @@
     por       xmm2, xmm5
     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
-    movdqa    [edx + 32], xmm2
+    movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     por       xmm1, xmm5
     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
-    movdqa    [edx + 16], xmm1
+    movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
-    sub       ecx, 16
-    movdqa    [edx + 48], xmm3
+    movdqu    [edx + 48], xmm3
     lea       edx, [edx + 64]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
@@ -451,7 +465,7 @@
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                           int pix) {
   __asm {
@@ -475,7 +489,6 @@
     sub       edx, eax
     sub       edx, eax
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
@@ -492,8 +505,8 @@
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -501,8 +514,155 @@
   }
 }
 
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd       xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpsllw     ymm4, ymm4, 10
+    vpsrlw     ymm4, ymm4, 5
+    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax
+    sub        edx, eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_RGB565TOARGBROW_AVX2
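
Note: the pmul trick described in the comments above works because, for a 5-bit value v left-aligned in a 16-bit word, multiplying by (256 + 8) and taking the upper 16 bits of the product (what pmulhuw computes) is exactly (v << 3) | (v >> 2), i.e. the channel widened to 8 bits with its top bits replicated; the 6-bit green channel uses the analogous constant. A small scalar self-check (helper names are illustrative only):

#include <assert.h>
#include <stdint.h>

// Exact 5-to-8 bit expansion: shift up and replicate the top bits.
static uint8_t Expand5(uint8_t v5) {
  return (uint8_t)((v5 << 3) | (v5 >> 2));
}

// Multiply-based equivalent used by the SIMD code: place the 5 bits in the
// top of a 16-bit word, multiply by (256 + 8), keep the upper 16 bits.
static uint8_t Expand5Mul(uint8_t v5) {
  const uint16_t word = (uint16_t)(v5 << 11);  // 5 bits in bits 11..15
  return (uint8_t)(((uint32_t)word * 0x0108u) >> 16);
}

int main(void) {
  for (int v = 0; v < 32; ++v) {
    assert(Expand5((uint8_t)v) == Expand5Mul((uint8_t)v));
  }
  return 0;
}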
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd       xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax,  [esp + 4]   // src_argb1555
+    mov        edx,  [esp + 8]   // dst_argb
+    mov        ecx,  [esp + 12]  // pix
+    sub        edx,  eax
+    sub        edx,  eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpand      ymm1, ymm1, ymm3
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpsraw     ymm2, ymm0, 8       // A
+    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpand      ymm2, ymm2, ymm7
+    vpor       ymm0, ymm0, ymm2    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    vmovd     xmm4, eax
+    vbroadcastss ymm4, xmm4
+    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]   // src_argb4444
+    mov       edx,  [esp + 8]   // dst_argb
+    mov       ecx,  [esp + 12]  // pix
+    sub       edx,  eax
+    sub       edx,  eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5    // mask high nibbles
+    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vpsrlw     ymm3, ymm2, 4
+    vpsllw     ymm1, ymm0, 4
+    vpor       ymm2, ymm2, ymm3
+    vpor       ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpckhbw ymm1, ymm0, ymm2
+    vpunpcklbw ymm0, ymm0, ymm2
+    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+
 // 24 instructions
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -525,7 +685,6 @@
     sub       edx, eax
     sub       edx, eax
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
@@ -546,8 +705,8 @@
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -556,7 +715,7 @@
 }
 
 // 18 instructions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -571,7 +730,6 @@
     sub       edx, eax
     sub       edx, eax
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
@@ -586,8 +744,8 @@
     movdqa    xmm1, xmm0
     punpcklbw xmm0, xmm2
     punpckhbw xmm1, xmm2
-    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
-    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
     lea       eax, [eax + 16]
     sub       ecx, 8
     jg        convertloop
@@ -595,7 +753,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
@@ -603,7 +761,6 @@
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRGB24
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
@@ -634,7 +791,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
@@ -642,7 +799,6 @@
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm6, kShuffleMaskARGBToRAW
 
-    align      4
  convertloop:
     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
@@ -673,7 +829,8 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+// 4 pixels
+__declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
@@ -687,9 +844,8 @@
     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
     pslld     xmm5, 11
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0    // B
     movdqa    xmm2, xmm0    // G
     pslld     xmm0, 8       // R
@@ -711,8 +867,97 @@
   }
 }
 
+// 4 pixels
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix) {
+  __asm {
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    movd      xmm6, [esp + 12] // dither4
+    mov       ecx, [esp + 16]  // pix
+    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    movdqa    xmm7, xmm6
+    punpcklwd xmm6, xmm6
+    punpckhwd xmm7, xmm7
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6    // add dither
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    vbroadcastss xmm6, [esp + 12]  // dither4
+    mov        ecx, [esp + 16]     // pix
+    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    vpermq     ymm6, ymm6, 0xd8
+    vpunpcklwd ymm6, ymm6, ymm6
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6    // add dither
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
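
Note: a scalar sketch of what the dither rows above compute. Pixel x gets the dither byte (dither4 >> (8 * (x & 3))) added to each channel with unsigned saturation (that is what the punpcklbw/punpcklwd expansion plus paddusb implement), and the result is then truncated to 5/6/5 bits. Names below are illustrative, not library functions:

#include <stdint.h>

static uint8_t AddSat(uint8_t a, uint8_t b) {
  const int s = a + b;
  return (uint8_t)(s > 255 ? 255 : s);
}

// Scalar sketch of ARGB -> RGB565 with the packed 4-byte ordered dither.
static void ARGBToRGB565DitherRow_C_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_rgb, uint32_t dither4,
                                           int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t d = (uint8_t)(dither4 >> (8 * (x & 3)));
    const uint8_t b = AddSat(src_argb[4 * x + 0], d);
    const uint8_t g = AddSat(src_argb[4 * x + 1], d);
    const uint8_t r = AddSat(src_argb[4 * x + 2], d);
    const uint16_t p =
        (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    dst_rgb[2 * x + 0] = (uint8_t)p;        // low byte first (little endian)
    dst_rgb[2 * x + 1] = (uint8_t)(p >> 8);
  }
}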
+
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
@@ -727,9 +972,8 @@
     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
     pslld     xmm7, 15
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0    // B
     movdqa    xmm2, xmm0    // G
     movdqa    xmm3, xmm0    // R
@@ -754,7 +998,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov       eax, [esp + 4]   // src_argb
@@ -765,14 +1009,13 @@
     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
     pand      xmm0, xmm3    // low nibble
     pand      xmm1, xmm4    // high nibble
-    psrl      xmm0, 4
-    psrl      xmm1, 8
+    psrld     xmm0, 4
+    psrld     xmm1, 8
     por       xmm0, xmm1
     packuswb  xmm0, xmm0
     lea       eax, [eax + 16]
@@ -784,22 +1027,129 @@
   }
 }
 
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm4, ymm4, ymm4
+    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpslld     ymm7, ymm7, 15
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9       // R
+    vpsrld     ymm2, ymm0, 6       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrad     ymm0, ymm0, 16      // A
+    vpand      ymm3, ymm3, ymm6    // R
+    vpand      ymm2, ymm2, ymm5    // G
+    vpand      ymm1, ymm1, ymm4    // B
+    vpand      ymm0, ymm0, ymm7    // A
+    vpor       ymm0, ymm0, ymm1    // BA
+    vpor       ymm2, ymm2, ymm3    // GR
+    vpor       ymm0, ymm0, ymm2    // BGRA
+    vpackssdw  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpsllw     ymm4, ymm4, 12
+    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4    // high nibble
+    vpand      ymm0, ymm0, ymm3    // low nibble
+    vpsrld     ymm1, ymm1, 8
+    vpsrld     ymm0, ymm0, 4
+    vpor       ymm0, ymm0, ymm1
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB4444ROW_AVX2
+
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
     movdqa     xmm4, kARGBToY
+    movdqa     xmm5, kAddY16
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
@@ -811,16 +1161,17 @@
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) __declspec(align(16))
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked)
 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
@@ -829,12 +1180,11 @@
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
@@ -847,17 +1197,22 @@
     psrlw      xmm0, 7
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
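
Note: a scalar sketch contrasting the two luma paths above. The weights below are the conventional libyuv BT.601 values and are an assumption here (the authoritative kARGBToY/kARGBToYJ tables are defined earlier in this file); what the code above establishes is the structure: ARGBToY adds the +16 offset with no rounding, while ARGBToYJ skips the offset and rounds by adding 64, the 7-bit fixed point 0.5 noted at kAddYJ64:

#include <stdint.h>

// Assumed BT.601 weights in 7-bit fixed point: Y uses 13/65/33, YJ uses 15/75/38.
static uint8_t ARGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // +16 offset, no rounding
}

static uint8_t ARGBToYJ_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);    // rounding, no offset
}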
 
 #ifdef HAS_ARGBTOYROW_AVX2

+// vpermd to undo the lane mutation from vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
@@ -865,9 +1220,8 @@
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToY
     vbroadcastf128 ymm5, kAddY16
-    vmovdqa    ymm6, kPermdARGBToY_AVX
+    vmovdqu    ymm6, kPermdARGBToY_AVX
 
-    align      4
  convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -884,10 +1238,10 @@
     vpsrlw     ymm2, ymm2, 7
     vpackuswb  ymm0, ymm0, ymm2  // mutates.
     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
-    vpaddb     ymm0, ymm0, ymm5
-    sub        ecx, 32
+    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
     vzeroupper
     ret
@@ -895,9 +1249,9 @@
 }
 #endif  //  HAS_ARGBTOYROW_AVX2
 
-#ifdef HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
@@ -905,9 +1259,8 @@
     mov        ecx, [esp + 12]  /* pix */
     vbroadcastf128 ymm4, kARGBToYJ
     vbroadcastf128 ymm5, kAddYJ64
-    vmovdqa    ymm6, kPermdARGBToY_AVX
+    vmovdqu    ymm6, kPermdARGBToY_AVX
 
-    align      4
  convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -926,9 +1279,9 @@
     vpsrlw     ymm2, ymm2, 7
     vpackuswb  ymm0, ymm0, ymm2  // mutates.
     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
-    sub        ecx, 32
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
 
     vzeroupper
@@ -937,119 +1290,15 @@
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
-    movdqa     xmm4, kARGBToY
-
-    align      4
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm4, kARGBToYJ
-    movdqa     xmm5, kAddYJ64
-
-    align      4
- convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    paddw      xmm0, xmm5
-    paddw      xmm2, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
     movdqa     xmm4, kBGRAToY
-
-    align      4
- convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
-    movdqa     xmm4, kBGRAToY
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -1066,58 +1315,23 @@
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
     movdqa     xmm4, kABGRToY
-
-    align      4
- convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
-    movdqa     xmm4, kABGRToY
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -1134,58 +1348,23 @@
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
     mov        edx, [esp + 8]   /* dst_y */
     mov        ecx, [esp + 12]  /* pix */
-    movdqa     xmm5, kAddY16
     movdqa     xmm4, kRGBAToY
-
-    align      4
- convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm4
-    pmaddubsw  xmm1, xmm4
-    pmaddubsw  xmm2, xmm4
-    pmaddubsw  xmm3, xmm4
-    lea        eax, [eax + 64]
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psrlw      xmm0, 7
-    psrlw      xmm2, 7
-    packuswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* pix */
     movdqa     xmm5, kAddY16
-    movdqa     xmm4, kRGBAToY
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -1202,15 +1381,15 @@
     psrlw      xmm2, 7
     packuswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1221,22 +1400,26 @@
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1264,10 +1447,10 @@
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1276,7 +1459,7 @@
   }
 }
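
Note: the ARGBToUV rows follow the three steps called out in the comments above: average a 2x2 block of ARGB pixels (two source rows via src_stride, horizontal pairs via shufps/pavgb), apply the U and V weights, then bias to unsigned and store 8 U and 8 V bytes. A scalar sketch of one output pair; the /256 weights are the conventional BT.601 values and are an assumption (the authoritative kARGBToU/kARGBToV tables are defined earlier in this file):

#include <stdint.h>

// One U/V pair from a 2x2 block of ARGB. row0/row1 point at the two source
// rows; the "+ 2" rounds roughly like the pavgb averaging in the SIMD path.
// Adding 0x8000 before the shift folds in the +128 bias and keeps the
// shifted value non-negative.
static void ARGBToUV_2x2_sketch(const uint8_t* row0, const uint8_t* row1,
                                uint8_t* dst_u, uint8_t* dst_v) {
  const int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  const int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  const int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *dst_u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8000) >> 8);
  *dst_v = (uint8_t)((-18 * b - 94 * g + 112 * r + 0x8000) >> 8);
}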
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                         uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1287,22 +1470,26 @@
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToUJ
-    movdqa     xmm6, kARGBToVJ
     movdqa     xmm5, kAddUVJ128
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm7, kARGBToUJ
     sub        edi, edx             // stride from u to v
 
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1331,10 +1518,10 @@
     packsswb   xmm0, xmm1
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1344,7 +1531,7 @@
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) __declspec(align(32))
+__declspec(naked)
 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1360,7 +1547,6 @@
     vbroadcastf128 ymm7, kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      4
  convertloop:
     /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
@@ -1396,10 +1582,10 @@
     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 
     // step 3 - store 16 U and 16 V values
-    sub         ecx, 32
     vextractf128 [edx], ymm0, 0 // U
     vextractf128 [edx + edi], ymm0, 1 // V
     lea        edx, [edx + 16]
+    sub        ecx, 32
     jg         convertloop
 
     pop        edi
@@ -1410,148 +1596,7 @@
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
-__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kARGBToUJ
-    movdqa     xmm6, kARGBToVJ
-    movdqa     xmm5, kAddUVJ128
-    sub        edi, edx             // stride from u to v
-
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
-    movdqu     xmm4, [eax + esi]
-    pavgb      xmm0, xmm4
-    movdqu     xmm4, [eax + esi + 16]
-    pavgb      xmm1, xmm4
-    movdqu     xmm4, [eax + esi + 32]
-    pavgb      xmm2, xmm4
-    movdqu     xmm4, [eax + esi + 48]
-    pavgb      xmm3, xmm4
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
-    paddw      xmm1, xmm5
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1560,70 +1605,11 @@
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      4
- convertloop:
-    /* convert to U and V */
-    movdqa     xmm0, [eax]          // U
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm7
-    pmaddubsw  xmm1, xmm7
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm3, xmm7
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    sub        ecx,  16
-    movdqa     [edx], xmm0
-
-    movdqa     xmm0, [eax]          // V
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pmaddubsw  xmm0, xmm6
-    pmaddubsw  xmm1, xmm6
-    pmaddubsw  xmm2, xmm6
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm1
-    phaddw     xmm2, xmm3
-    psraw      xmm0, 8
-    psraw      xmm2, 8
-    packsswb   xmm0, xmm2
-    paddb      xmm0, xmm5
-    lea        eax,  [eax + 64]
-    movdqa     [edx + edi], xmm0
-    lea        edx,  [edx + 16]
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
-                                    uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
  convertloop:
     /* convert to U and V */
     movdqu     xmm0, [eax]          // U
@@ -1640,7 +1626,6 @@
     psraw      xmm2, 8
     packsswb   xmm0, xmm2
     paddb      xmm0, xmm5
-    sub        ecx,  16
     movdqu     [edx], xmm0
 
     movdqu     xmm0, [eax]          // V
@@ -1660,6 +1645,7 @@
     lea        eax,  [eax + 64]
     movdqu     [edx + edi], xmm0
     lea        edx,  [edx + 16]
+    sub        ecx,  16
     jg         convertloop
 
     pop        edi
@@ -1667,7 +1653,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1676,71 +1662,11 @@
     mov        edx, [esp + 4 + 8]   // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
     sub        edi, edx             // stride from u to v
 
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
-                                    uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
-    mov        edi, [esp + 4 + 12]  // dst_v
-    mov        ecx, [esp + 4 + 16]  // pix
-    movdqa     xmm7, kARGBToU
-    movdqa     xmm6, kARGBToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
@@ -1774,10 +1700,10 @@
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1785,7 +1711,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1796,92 +1722,26 @@
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kBGRAToU
-    movdqa     xmm6, kBGRAToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm7, kBGRAToU
     sub        edi, edx             // stride from u to v
 
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kBGRAToU
-    movdqa     xmm6, kBGRAToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1909,10 +1769,10 @@
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -1921,7 +1781,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1932,92 +1792,26 @@
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kABGRToU
-    movdqa     xmm6, kABGRToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kABGRToV
+    movdqa     xmm7, kABGRToU
     sub        edi, edx             // stride from u to v
 
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kABGRToU
-    movdqa     xmm6, kABGRToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -2045,10 +1839,10 @@
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -2057,7 +1851,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -2068,92 +1862,26 @@
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kRGBAToU
-    movdqa     xmm6, kRGBAToV
     movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kRGBAToV
+    movdqa     xmm7, kRGBAToU
     sub        edi, edx             // stride from u to v
 
-    align      4
- convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
-    lea        eax,  [eax + 64]
-    movdqa     xmm4, xmm0
-    shufps     xmm0, xmm1, 0x88
-    shufps     xmm4, xmm1, 0xdd
-    pavgb      xmm0, xmm4
-    movdqa     xmm4, xmm2
-    shufps     xmm2, xmm3, 0x88
-    shufps     xmm4, xmm3, 0xdd
-    pavgb      xmm2, xmm4
-
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    pmaddubsw  xmm0, xmm7  // U
-    pmaddubsw  xmm2, xmm7
-    pmaddubsw  xmm1, xmm6  // V
-    pmaddubsw  xmm3, xmm6
-    phaddw     xmm0, xmm2
-    phaddw     xmm1, xmm3
-    psraw      xmm0, 8
-    psraw      xmm1, 8
-    packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
-
-    // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
-    lea        edx, [edx + 8]
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                                 uint8* dst_u, uint8* dst_v, int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
-    mov        edx, [esp + 8 + 12]  // dst_u
-    mov        edi, [esp + 8 + 16]  // dst_v
-    mov        ecx, [esp + 8 + 20]  // pix
-    movdqa     xmm7, kRGBAToU
-    movdqa     xmm6, kRGBAToV
-    movdqa     xmm5, kAddUV128
-    sub        edi, edx             // stride from u to v
-
-    align      4
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    movdqu     xmm2, [eax + 32]
-    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
     movdqu     xmm4, [eax + esi + 16]
     pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
     movdqu     xmm4, [eax + esi + 32]
     pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
     movdqu     xmm4, [eax + esi + 48]
     pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -2181,10 +1909,10 @@
     paddb      xmm0, xmm5            // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    sub        ecx, 16
     movlps     qword ptr [edx], xmm0 // U
     movhps     qword ptr [edx + edi], xmm0 // V
     lea        edx, [edx + 8]
+    sub        ecx, 16
     jg         convertloop
 
     pop        edi
@@ -2194,39 +1922,92 @@
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
+// Read 16 UV from 444
+#define READYUV444_AVX2 __asm {                                                \
+    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
+    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpermq     ymm1, ymm1, 0xd8                                          \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+  }
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+  }
+
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 __asm {                                                \
+    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
+    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
+  }
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+  }
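+// For reference, a scalar sketch of the chroma subsampling handled by the
+// READ* macros above (sketch only; this helper is hypothetical and unused by
+// the assembly). 444 carries one UV pair per pixel, 422 one per 2 pixels and
+// 411 one per 4 pixels; NV12 is 422 chroma with U and V already interleaved
+// in one plane, so READNV12_AVX2 only has to duplicate each pair.
+static void ReadUVScalar(const uint8* u, const uint8* v, int subsample,
+                         uint8 uv[16][2]) {
+  for (int i = 0; i < 16; ++i) {
+    uv[i][0] = u[i / subsample];  // subsample: 1 = 444, 2 = 422, 4 = 411
+    uv[i][1] = v[i / subsample];
+  }
+}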
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
+    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
+    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
+    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
+    __asm vpsubw     ymm2, ymm3, ymm2                                          \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
+    __asm vpsubw     ymm1, ymm3, ymm1                                          \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
+    __asm vpsubw     ymm0, ymm3, ymm0                                          \
+    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
+    __asm lea        eax, [eax + 16]                                           \
+    __asm vpermq     ymm3, ymm3, 0xd8                                          \
+    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
+    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
+    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
+    __asm vpsraw     ymm0, ymm0, 6                                             \
+    __asm vpsraw     ymm1, ymm1, 6                                             \
+    __asm vpsraw     ymm2, ymm2, 6                                             \
+    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+  }
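+// Scalar sketch of the fixed-point math above (reference only; the integer
+// constants below are illustrative stand-ins for the packed YuvConstants
+// values, assumed to encode the usual BT.601 studio-range conversion
+//   R = 1.164*(Y-16) + 1.596*(V-128)
+//   G = 1.164*(Y-16) - 0.813*(V-128) - 0.391*(U-128)
+//   B = 1.164*(Y-16) + 2.018*(U-128)
+// scaled by 64, so the final arithmetic shift by 6 removes the scale).
+static uint8 ClampTo255(int v) {
+  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}
+static void YuvToRgbScalar(uint8 y, uint8 u, uint8 v,
+                           uint8* b, uint8* g, uint8* r) {
+  const int yg = 74 * (y - 16);                        // ~1.164 * 64
+  *b = ClampTo255((yg + 129 * (u - 128)) >> 6);        // ~2.018 * 64
+  *g = ClampTo255((yg - 25 * (u - 128) - 52 * (v - 128)) >> 6);
+  *r = ClampTo255((yg + 102 * (v - 128)) >> 6);        // ~1.596 * 64
+}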
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm {                                                 \
+    /* Step 3: Weave into ARGB */                                              \
+    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8                                          \
+    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vmovdqu    0[edx], ymm1                                              \
+    __asm vmovdqu    32[edx], ymm0                                             \
+    __asm lea        edx,  [edx + 64]                                          \
+  }
+
 #ifdef HAS_I422TOARGBROW_AVX2
-
-static const lvec8 kUVToB_AVX = {
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-static const lvec8 kUVToR_AVX = {
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-static const lvec8 kUVToG_AVX = {
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-static const lvec16 kYToRgb_AVX = {
-  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
-};
-static const lvec16 kYSub16_AVX = {
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
-static const lvec16 kUVBiasB_AVX = {
-  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
-};
-static const lvec16 kUVBiasG_AVX = {
-  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
-};
-static const lvec16 kUVBiasR_AVX = {
-  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
-};
-
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_AVX2(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2242,63 +2023,332 @@
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-    vpxor      ymm4, ymm4, ymm4
 
-    align      4
  convertloop:
-    vmovq      xmm0, qword ptr [esi]          //  U
-    vmovq      xmm1, qword ptr [esi + edi]    //  V
-    lea        esi,  [esi + 8]
-    vpunpcklbw ymm0, ymm0, ymm1               // UV
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
-    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
-    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
-    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
-    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
-    vpsubw     ymm1, ymm1, kUVBiasG_AVX
-    vpsubw     ymm0, ymm0, kUVBiasR_AVX
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+    STOREARGB_AVX2
 
-    // Step 2: Find Y contribution to 16 R,G,B values
-    vmovdqu    xmm3, [eax]                  // NOLINT
-    lea        eax, [eax + 16]
-    vpermq     ymm3, ymm3, 0xd8
-    vpunpcklbw ymm3, ymm3, ymm4
-    vpsubsw    ymm3, ymm3, kYSub16_AVX
-    vpmullw    ymm3, ymm3, kYToRgb_AVX
-    vpaddsw    ymm2, ymm2, ymm3           // B += Y
-    vpaddsw    ymm1, ymm1, ymm3           // G += Y
-    vpaddsw    ymm0, ymm0, ymm3           // R += Y
-    vpsraw     ymm2, ymm2, 6
-    vpsraw     ymm1, ymm1, 6
-    vpsraw     ymm0, ymm0, 6
-    vpackuswb  ymm2, ymm2, ymm2           // B
-    vpackuswb  ymm1, ymm1, ymm1           // G
-    vpackuswb  ymm0, ymm0, ymm0           // R
-
-    // Step 3: Weave into ARGB
-    vpunpcklbw ymm2, ymm2, ymm1           // BG
-    vpermq     ymm2, ymm2, 0xd8
-    vpunpcklbw ymm0, ymm0, ymm5           // RA
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
-    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
-    vmovdqu    [edx], ymm1
-    vmovdqu    [edx + 32], ymm2
-    lea        edx,  [edx + 64]
     sub        ecx, 16
     jg         convertloop
-    vzeroupper
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
 #endif  // HAS_I422TOARGBROW_AVX2
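+// Usage sketch (row pointers are hypothetical): each row function converts
+// one row of width pixels, 16 per loop iteration, so width is expected to be
+// a multiple of 16 here and any remainder is presumably handled by the caller.
+//   I422ToARGBRow_AVX2(y_row, u_row, v_row, argb_row, width);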
 
-#ifdef HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_J422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
 
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvJConstants)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_J422TOARGBROW_AVX2
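+// J422 differs from I422 only in the constant set: kYuvJConstants is assumed
+// to encode the full-range (JPEG) YCbCr equations, approximately
+//   R = Y + 1.402*(V-128)
+//   G = Y - 0.344*(U-128) - 0.714*(V-128)
+//   B = Y + 1.772*(U-128)
+// packed into the same 6-bit fixed-point layout consumed by YUVTORGB_AVX2.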
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV444_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV411_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV12_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV12_AVX2
+    YUVTORGB_AVX2(kYvuConstants)
+    STOREARGB_AVX2
+
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
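+// Note that NV21 reuses READNV12_AVX2 unchanged: the interleaved bytes are
+// loaded as-is and the V-before-U ordering is absorbed by kYvuConstants,
+// which is assumed to hold each pmaddubsw coefficient pair swapped, e.g.
+//   kYuvConstants.kUVToB = { UB, VB, UB, VB, ... }
+//   kYvuConstants.kUVToB = { VB, UB, VB, UB, ... }
+// so multiplying (V,U) byte pairs yields the same sums as (U,V) pairs would.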
+
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into BGRA
+    vpunpcklbw ymm1, ymm1, ymm0           // GB
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm5, ymm2           // AR
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
+    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm2
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I422TOBGRAROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into RGBA
+    vpunpcklbw ymm1, ymm1, ymm2           // GR
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm5, ymm0           // AB
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
+    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I422TOABGRROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToABGRRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ABGR
+    vpunpcklbw ymm1, ymm2, ymm1           // RG
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm0, ymm5           // BA
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I422TOABGRROW_AVX2
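+// The four AVX2 writers above differ only in how Step 3 weaves the B, G, R
+// and alpha registers. Reading the stores back, the per-pixel byte order in
+// memory (lowest address first; each name reads as a little-endian uint32) is:
+//   ARGB: B, G, R, A      BGRA: A, R, G, B
+//   ABGR: R, G, B, A      RGBA: A, B, G, R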
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 
 // Read 8 UV from 444.
@@ -2327,7 +2377,7 @@
     __asm lea        esi,  [esi + 2]                                           \
     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
   }
 
 // Read 4 UV from NV12, upsample to 8 UV.
@@ -2338,22 +2388,25 @@
   }
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm {                                                       \
+#define YUVTORGB(YuvConstants) __asm {                                         \
     /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
     __asm movdqa     xmm1, xmm0                                                \
     __asm movdqa     xmm2, xmm0                                                \
-    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
-    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
-    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
-    __asm psubw      xmm1, kUVBiasG                                            \
-    __asm psubw      xmm2, kUVBiasR                                            \
+    __asm movdqa     xmm3, xmm0                                                \
+    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
+    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
+    __asm psubw      xmm0, xmm1                                                \
+    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
+    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
+    __asm psubw      xmm1, xmm2                                                \
+    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
+    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
+    __asm psubw      xmm2, xmm3                                                \
     /* Step 2: Find Y contribution to 8 R,G,B values */                        \
     __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
     __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm4                                                \
-    __asm psubsw     xmm3, kYSub16                                             \
-    __asm pmullw     xmm3, kYToRgb                                             \
+    __asm punpcklbw  xmm3, xmm3                                                \
+    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
     __asm paddsw     xmm0, xmm3           /* B += Y */                         \
     __asm paddsw     xmm1, xmm3           /* G += Y */                         \
     __asm paddsw     xmm2, xmm3           /* R += Y */                         \
@@ -2365,37 +2418,133 @@
     __asm packuswb   xmm2, xmm2           /* R */                              \
   }
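+// The Y path is reshaped here: punpcklbw xmm3,xmm3 turns each Y byte into the
+// 16-bit value y*257, and pmulhuw keeps the high 16 bits of y*257*kYToRgb,
+// i.e. roughly (y * kYToRgb) >> 8 per lane. The former "subtract 16, pmullw"
+// stage is assumed to be folded into kYToRgb and the kUVBias* constants,
+// which would also explain why the bias is now loaded first and the UV
+// products are subtracted from it rather than the other way around.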
 
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm {                                                       \
-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
+// Store 8 ARGB values.
+#define STOREARGB __asm {                                                      \
+    /* Step 3: Weave into ARGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm movdqa     xmm2, xmm0                                                \
-    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
-    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
-    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
-    __asm psubw      xmm1, kUVBiasG                                            \
-    __asm psubw      xmm2, kUVBiasR                                            \
-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm4                                                \
-    __asm psubsw     xmm3, kYSub16                                             \
-    __asm pmullw     xmm3, kYToRgb                                             \
-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
-    __asm psraw      xmm0, 6                                                   \
-    __asm psraw      xmm1, 6                                                   \
-    __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm0                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]                                          \
   }
 
-// 8 pixels, dest aligned 16.
+// Store 8 BGRA values.
+#define STOREBGRA __asm {                                                      \
+    /* Step 3: Weave into BGRA */                                              \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 ABGR values.
+#define STOREABGR __asm {                                                      \
+    /* Step 3: Weave into ABGR */                                              \
+    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
+    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
+    __asm movdqa     xmm1, xmm2                                                \
+    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm2                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm {                                                      \
+    /* Step 3: Weave into RGBA */                                              \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm {                                                     \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RGB24 */                                                \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]                                          \
+  }
+
+// Store 8 RAW values.
+#define STORERAW __asm {                                                       \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RAW */                                                  \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]                                          \
+  }
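+// Scalar equivalent of the two 24-bit packers above (sketch only): both weave
+// B,G,R into 4-byte groups, then the caller-loaded kShuffleMaskARGBTo* masks
+// and palignr drop the duplicated byte, emitting 24 bytes per 8 pixels. Per
+// those masks, RGB24 lands as B,G,R and RAW as R,G,B in memory.
+static void StoreRGB24Scalar(const uint8* b, const uint8* g, const uint8* r,
+                             uint8* dst, int raw) {
+  for (int i = 0; i < 8; ++i) {
+    dst[i * 3 + 0] = raw ? r[i] : b[i];
+    dst[i * 3 + 1] = g[i];
+    dst[i * 3 + 2] = raw ? b[i] : r[i];
+  }
+}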
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm {                                                    \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RGB565 */                                               \
+    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0    /* G */                                     \
+    __asm pslld      xmm0, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm0, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm0, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm0, xmm3    /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1    /* G */                                     \
+    __asm pslld      xmm1, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm1, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm1, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm packssdw   xmm0, xmm1                                                \
+    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]                                           \
+  }
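+// Scalar form of the RGB565 pack above (sketch only): keep the top 5, 6 and 5
+// bits of B, G and R and place them in bits 4:0, 10:5 and 15:11 of a 16-bit
+// pixel, which is what the shift/mask/por/packssdw sequence computes.
+static uint16 PackRGB565Scalar(uint8 b, uint8 g, uint8 r) {
+  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}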
+
+// 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -2411,22 +2560,12 @@
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READYUV444
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -2436,9 +2575,9 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
@@ -2453,27 +2592,14 @@
     mov        edx, [esp + 8 + 16]  // rgb24
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
     movdqa     xmm5, kShuffleMaskARGBToRGB24_0
     movdqa     xmm6, kShuffleMaskARGBToRGB24
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STORERGB24
 
-    // Step 3: Weave into RRGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm2           // RR
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
-    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
-    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
-    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
-    movq       qword ptr [edx], xmm0  // First 8 bytes
-    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
-    lea        edx,  [edx + 24]
     sub        ecx, 8
     jg         convertloop
 
@@ -2483,9 +2609,9 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked)
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -2500,27 +2626,14 @@
     mov        edx, [esp + 8 + 16]  // raw
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
     movdqa     xmm5, kShuffleMaskARGBToRAW_0
     movdqa     xmm6, kShuffleMaskARGBToRAW
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STORERAW
 
-    // Step 3: Weave into RRGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm2           // RR
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
-    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
-    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
-    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
-    movq       qword ptr [edx], xmm0  // First 8 bytes
-    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
-    lea        edx,  [edx + 24]
     sub        ecx, 8
     jg         convertloop
 
@@ -2530,9 +2643,9 @@
   }
 }
 
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* u_buf,
                            const uint8* v_buf,
@@ -2547,7 +2660,6 @@
     mov        edx, [esp + 8 + 16]  // rgb565
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
     psrld      xmm5, 27
     pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
@@ -2556,45 +2668,12 @@
     pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
     pslld      xmm7, 11
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STORERGB565
 
-    // Step 3: Weave into RRGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm2           // RR
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
-
-    // Step 3b: RRGB -> RGB565
-    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
-    movdqa     xmm2, xmm0    // G
-    pslld      xmm0, 8       // R
-    psrld      xmm3, 3       // B
-    psrld      xmm2, 5       // G
-    psrad      xmm0, 16      // R
-    pand       xmm3, xmm5    // B
-    pand       xmm2, xmm6    // G
-    pand       xmm0, xmm7    // R
-    por        xmm3, xmm2    // BG
-    por        xmm0, xmm3    // BGR
-    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
-    movdqa     xmm2, xmm1    // G
-    pslld      xmm1, 8       // R
-    psrld      xmm3, 3       // B
-    psrld      xmm2, 5       // G
-    psrad      xmm1, 16      // R
-    pand       xmm3, xmm5    // B
-    pand       xmm2, xmm6    // G
-    pand       xmm1, xmm7    // R
-    por        xmm3, xmm2    // BG
-    por        xmm1, xmm3    // BGR
-    packssdw   xmm0, xmm1
     sub        ecx, 8
-    movdqu     [edx], xmm0   // store 8 pixels of RGB565
-    lea        edx, [edx + 16]
     jg         convertloop
 
     pop        edi
@@ -2603,9 +2682,9 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -2621,22 +2700,12 @@
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -2646,10 +2715,44 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
+// JPEG color space version of I422ToARGB.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvJConstants)
+    STOREARGB
+
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -2665,23 +2768,13 @@
     mov        edx, [esp + 12 + 16]  // argb
     mov        ecx, [esp + 12 + 20]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
 
-    align      4
  convertloop:
     READYUV411  // modifies EBX
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -2692,9 +2785,9 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
@@ -2706,22 +2799,12 @@
     mov        edx, [esp + 4 + 12]  // argb
     mov        ecx, [esp + 4 + 16]  // width
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READNV12
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -2730,9 +2813,9 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
+// 8 pixels.
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* dst_argb,
@@ -2740,196 +2823,16 @@
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // VU
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READNV12
-    YVUTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV444
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // argb
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       ebx
-    push       esi
-    push       edi
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // argb
-    mov        ecx, [esp + 12 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV411  // modifies EBX
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
     mov        esi, [esp + 4 + 8]   // UV
     mov        edx, [esp + 4 + 12]  // argb
     mov        ecx, [esp + 4 + 16]  // width
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READNV12
-    YUVTORGB
+    YUVTORGB(kYvuConstants)
+    STOREARGB
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -2938,45 +2841,7 @@
   }
 }
 
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* uv_buf,
-                                   uint8* dst_argb,
-                                   int width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]   // Y
-    mov        esi, [esp + 4 + 8]   // VU
-    mov        edx, [esp + 4 + 12]  // argb
-    mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READNV12
-    YVUTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm0, xmm1           // BG
-    punpcklbw  xmm2, xmm5           // RA
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
-    movdqu     [edx], xmm0
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -2991,23 +2856,12 @@
     mov        edx, [esp + 8 + 16]  // bgra
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREBGRA
 
-    // Step 3: Weave into BGRA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -3017,48 +2871,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_bgra,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // bgra
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into BGRA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm0           // GB
-    punpcklbw  xmm5, xmm2           // AR
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
-    movdqu     [edx], xmm5
-    movdqu     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -3074,22 +2887,12 @@
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STOREABGR
 
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm0, xmm5           // BA
-    movdqa     xmm1, xmm2
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -3099,48 +2902,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_abgr,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // abgr
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw  xmm2, xmm1           // RG
-    punpcklbw  xmm0, xmm5           // BA
-    movdqa     xmm1, xmm2
-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
-    movdqu     [edx], xmm2
-    movdqu     [edx + 16], xmm1
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -3155,64 +2917,12 @@
     mov        edx, [esp + 8 + 16]  // rgba
     mov        ecx, [esp + 8 + 20]  // width
     sub        edi, esi
-    pxor       xmm4, xmm4
 
-    align      4
  convertloop:
     READYUV422
-    YUVTORGB
+    YUVTORGB(kYuvConstants)
+    STORERGBA
 
-    // Step 3: Weave into RGBA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm2           // GR
-    punpcklbw  xmm5, xmm0           // AB
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
-    movdqa     [edx], xmm5
-    movdqa     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* dst_rgba,
-                                   int width) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // U
-    mov        edi, [esp + 8 + 12]  // V
-    mov        edx, [esp + 8 + 16]  // rgba
-    mov        ecx, [esp + 8 + 20]  // width
-    sub        edi, esi
-    pxor       xmm4, xmm4
-
-    align      4
- convertloop:
-    READYUV422
-    YUVTORGB
-
-    // Step 3: Weave into RGBA
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
-    punpcklbw  xmm1, xmm2           // GR
-    punpcklbw  xmm5, xmm0           // AB
-    movdqa     xmm0, xmm5
-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
-    movdqu     [edx], xmm5
-    movdqu     [edx + 16], xmm0
-    lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
@@ -3224,33 +2934,33 @@
 
 #endif  // HAS_I422TOARGBROW_SSSE3
 
-#ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked) __declspec(align(16))
-void YToARGBRow_SSE2(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) {
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
   __asm {
-    pxor       xmm5, xmm5
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
-    pslld      xmm4, 24
-    mov        eax, 0x00100010
-    movd       xmm3, eax
-    pshufd     xmm3, xmm3, 0
-    mov        eax, 0x004a004a       // 74
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
     movd       xmm2, eax
     pshufd     xmm2, xmm2, 0
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+
     mov        eax, [esp + 4]       // Y
     mov        edx, [esp + 8]       // rgb
     mov        ecx, [esp + 12]      // width
 
-    align      4
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm5           // 0.Y
+    punpcklbw  xmm0, xmm0           // Y.Y
+    pmulhuw    xmm0, xmm2
     psubusw    xmm0, xmm3
-    pmullw     xmm0, xmm2
     psrlw      xmm0, 6
     packuswb   xmm0, xmm0           // G
 
@@ -3261,16 +2971,66 @@
     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx,  [edx + 32]
     sub        ecx, 8
     jg         convertloop
-
     ret
   }
 }
-#endif  // HAS_YTOARGBROW_SSE2
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd      xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd      xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpslld     ymm4, ymm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu    xmm0, [eax]
+    lea        eax, [eax + 16]
+    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpmulhuw   ymm0, ymm0, ymm2
+    vpsubusw   ymm0, ymm0, ymm3
+    vpsrlw     ymm0, ymm0, 6
+    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpor       ymm0, ymm0, ymm4
+    vpor       ymm1, ymm1, ymm4
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_AVX2
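For reference, both I400ToARGBRow variants above compute the same per-pixel result: scale Y to a gray value with G = (y - 16) * 1.164, clamp to 0..255, replicate it into B/G/R, and force alpha to 0xff. A minimal scalar sketch of that math (illustrative only, hypothetical name; the fixed-point constants in the asm approximate the same scale and bias):

    #include <stdint.h>

    // Scalar reference for I400ToARGBRow: one gray ARGB pixel per Y sample.
    static void I400ToARGBRow_C_sketch(const uint8_t* y_buf, uint8_t* dst_argb,
                                       int width) {
      for (int x = 0; x < width; ++x) {
        int g = (int)((y_buf[x] - 16) * 1.164f);  // expand 16..235 to 0..255
        if (g < 0) g = 0;
        if (g > 255) g = 255;
        dst_argb[4 * x + 0] = (uint8_t)g;  // B
        dst_argb[4 * x + 1] = (uint8_t)g;  // G
        dst_argb[4 * x + 2] = (uint8_t)g;  // R
        dst_argb[4 * x + 3] = 0xff;        // A
      }
    }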
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
@@ -3278,22 +3038,21 @@
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, kShuffleMirror
-    lea       eax, [eax - 16]
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax + ecx]
+    movdqu    xmm0, [eax - 16 + ecx]
     pshufb    xmm0, xmm5
-    sub       ecx, 16
-    movdqa    [edx], xmm0
+    movdqu    [edx], xmm0
     lea       edx, [edx + 16]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
@@ -3301,29 +3060,21 @@
 #endif  // HAS_MIRRORROW_SSSE3
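MirrorRow_SSSE3 above (and the AVX2 and SSE2 variants that follow) reverses the byte order of a row; the SIMD versions shuffle 16 or 32 bytes per iteration while indexing the source from the end via [eax - 16 + ecx]. A scalar sketch of the same operation (illustrative, hypothetical name):

    #include <stdint.h>

    // Scalar reference for MirrorRow: dst[i] = src[width - 1 - i].
    static void MirrorRow_C_sketch(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = src[width - 1 - i];
      }
    }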
 
 #ifdef HAS_MIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec8 kShuffleMirror_AVX2 = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    vmovdqa   ymm5, kShuffleMirror_AVX2
-    lea       eax, [eax - 32]
+    vbroadcastf128 ymm5, kShuffleMirror
 
-    align      4
  convertloop:
-    vmovdqu   ymm0, [eax + ecx]
+    vmovdqu   ymm0, [eax - 32 + ecx]
     vpshufb   ymm0, ymm0, ymm5
     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
-    sub       ecx, 32
     vmovdqu   [edx], ymm0
     lea       edx, [edx + 32]
+    sub       ecx, 32
     jg        convertloop
     vzeroupper
     ret
@@ -3332,19 +3083,15 @@
 #endif  // HAS_MIRRORROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    lea       eax, [eax - 16]
 
-    align      4
  convertloop:
-    movdqu    xmm0, [eax + ecx]
+    movdqu    xmm0, [eax - 16 + ecx]
     movdqa    xmm1, xmm0        // swap bytes
     psllw     xmm0, 8
     psrlw     xmm1, 8
@@ -3352,9 +3099,9 @@
     pshuflw   xmm0, xmm0, 0x1b  // swap words
     pshufhw   xmm0, xmm0, 0x1b
     pshufd    xmm0, xmm0, 0x4e  // swap qwords
-    sub       ecx, 16
     movdqu    [edx], xmm0
     lea       edx, [edx + 16]
+    sub       ecx, 16
     jg        convertloop
     ret
   }
@@ -3367,7 +3114,7 @@
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
@@ -3380,15 +3127,14 @@
     lea       eax, [eax + ecx * 2 - 16]
     sub       edi, edx
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax]
+    movdqu    xmm0, [eax]
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm1
-    sub       ecx, 8
     movlpd    qword ptr [edx], xmm0
     movhpd    qword ptr [edx + edi], xmm0
     lea       edx, [edx + 8]
+    sub       ecx, 8
     jg        convertloop
 
     pop       edi
@@ -3397,34 +3143,27 @@
 }
 #endif  // HAS_MIRRORROW_UV_SSSE3
 
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
-__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
-    movdqa    xmm5, kARGBShuffleMirror
 
-    align      4
  convertloop:
-    movdqa    xmm0, [eax]
+    movdqu    xmm0, [eax]
     lea       eax, [eax - 16]
-    pshufb    xmm0, xmm5
-    sub       ecx, 4
-    movdqa    [edx], xmm0
+    pshufd    xmm0, xmm0, 0x1b
+    movdqu    [edx], xmm0
     lea       edx, [edx + 16]
+    sub       ecx, 4
     jg        convertloop
     ret
   }
 }
-#endif  // HAS_ARGBMIRRORROW_SSSE3
+#endif  // HAS_ARGBMIRRORROW_SSE2
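ARGBMirrorRow reverses whole 4-byte pixels rather than individual bytes, which is why the SSE2 version above can drop the shuffle table and use pshufd 0x1b (reverse the four dwords of a register). Scalar sketch (illustrative, hypothetical name):

    #include <stdint.h>
    #include <string.h>

    // Scalar reference for ARGBMirrorRow: reverse the order of 32-bit pixels.
    static void ARGBMirrorRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                       int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t pixel;
        memcpy(&pixel, src + 4 * (width - 1 - i), 4);  // pixel from the end
        memcpy(dst + 4 * i, &pixel, 4);                // store at the front
      }
    }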
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
@@ -3432,21 +3171,19 @@
   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    lea       eax, [eax - 32]
-    vmovdqa   ymm5, kARGBShuffleMirror_AVX2
+    vmovdqu   ymm5, kARGBShuffleMirror_AVX2
 
-    align      4
  convertloop:
-    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
-    sub       ecx, 8
+    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
     vmovdqu   [edx], ymm0
     lea       edx, [edx + 32]
+    sub       ecx, 8
     jg        convertloop
     vzeroupper
     ret
@@ -3455,7 +3192,7 @@
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
@@ -3467,44 +3204,6 @@
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0
-    movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
-    psrlw      xmm3, 8
-    packuswb   xmm2, xmm3
-    movdqa     [edx], xmm0
-    movdqa     [edx + edi], xmm2
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                               int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -3527,10 +3226,11 @@
     ret
   }
 }
+
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
@@ -3542,7 +3242,6 @@
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -3569,7 +3268,7 @@
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
@@ -3580,37 +3279,6 @@
     mov        ecx, [esp + 4 + 16]   // width
     sub        edx, eax
 
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]      // read 16 U's
-    movdqa     xmm1, [eax + edx]  // and 16 V's
-    lea        eax,  [eax + 16]
-    movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm2
-    lea        edi, [edi + 32]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
-                               uint8* dst_uv, int width) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
-    sub        edx, eax
-
-    align      4
   convertloop:
     movdqu     xmm0, [eax]      // read 16 U's
     movdqu     xmm1, [eax + edx]  // and 16 V's
@@ -3631,7 +3299,7 @@
 #endif  //  HAS_MERGEUVROW_SSE2
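MergeUVRow interleaves separate U and V rows into a single UV row (the inverse of SplitUVRow above); the punpcklbw/punpckhbw pair handles 16 pixels per iteration. Scalar sketch of the same operation (illustrative, hypothetical name):

    #include <stdint.h>

    // Scalar reference for MergeUVRow: dst_uv = U0 V0 U1 V1 ...
    static void MergeUVRow_C_sketch(const uint8_t* src_u, const uint8_t* src_v,
                                    uint8_t* dst_uv, int width) {
      for (int i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }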
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                      int width) {
   __asm {
@@ -3642,17 +3310,16 @@
     mov        ecx, [esp + 4 + 16]   // width
     sub        edx, eax
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]           // read 32 U's
     vmovdqu    ymm1, [eax + edx]     // and 32 V's
     lea        eax,  [eax + 32]
     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
-    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
-    vmovdqu    [edi], ymm1
-    vmovdqu    [edi + 32], ymm2
+    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
     lea        edi, [edi + 64]
     sub        ecx, 32
     jg         convertloop
@@ -3666,20 +3333,19 @@
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
 
-    align      4
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax, [eax + 32]
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 32
     jg         convertloop
@@ -3688,8 +3354,33 @@
 }
 #endif  // HAS_COPYROW_SSE2
 
-// Unaligned Multiple of 1.
-__declspec(naked) __declspec(align(16))
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 64
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
+// Multiple of 1 (any byte count).
+__declspec(naked)
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
@@ -3704,27 +3395,9 @@
   }
 }
 
-#ifdef HAS_COPYROW_X86
-__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
-  __asm {
-    mov        eax, esi
-    mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    shr        ecx, 2
-    rep movsd
-    mov        edi, edx
-    mov        esi, eax
-    ret
-  }
-}
-#endif  // HAS_COPYROW_X86
-
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
@@ -3735,21 +3408,20 @@
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
     psrld      xmm1, 8
 
-    align      4
   convertloop:
-    movdqa     xmm2, [eax]
-    movdqa     xmm3, [eax + 16]
+    movdqu     xmm2, [eax]
+    movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
-    movdqa     xmm4, [edx]
-    movdqa     xmm5, [edx + 16]
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
     pand       xmm2, xmm0
     pand       xmm3, xmm0
     pand       xmm4, xmm1
     pand       xmm5, xmm1
     por        xmm2, xmm4
     por        xmm3, xmm5
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
@@ -3761,7 +3433,7 @@
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
@@ -3770,7 +3442,6 @@
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
-    align      4
   convertloop:
     vmovdqu    ymm1, [eax]
     vmovdqu    ymm2, [eax + 32]
@@ -3791,7 +3462,7 @@
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
@@ -3802,23 +3473,22 @@
     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
     psrld      xmm1, 8
 
-    align      4
   convertloop:
     movq       xmm2, qword ptr [eax]  // 8 Y's
     lea        eax, [eax + 8]
     punpcklbw  xmm2, xmm2
     punpckhwd  xmm3, xmm2
     punpcklwd  xmm2, xmm2
-    movdqa     xmm4, [edx]
-    movdqa     xmm5, [edx + 16]
+    movdqu     xmm4, [edx]
+    movdqu     xmm5, [edx + 16]
     pand       xmm2, xmm0
     pand       xmm3, xmm0
     pand       xmm4, xmm1
     pand       xmm5, xmm1
     por        xmm2, xmm4
     por        xmm3, xmm5
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
@@ -3830,7 +3500,7 @@
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov        eax, [esp + 4]   // src
@@ -3839,7 +3509,6 @@
     vpcmpeqb   ymm0, ymm0, ymm0
     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
 
-    align      4
   convertloop:
     vpmovzxbd  ymm1, qword ptr [eax]
     vpmovzxbd  ymm2, qword ptr [eax + 8]
@@ -3861,13 +3530,16 @@
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
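ARGBCopyAlphaRow and ARGBCopyYToAlphaRow both leave the destination's B, G and R bytes untouched and overwrite only the alpha byte, which is what the 0xff000000 / 0x00ffffff mask pair in the loops above implements. Scalar sketch of ARGBCopyAlphaRow (illustrative, hypothetical name):

    #include <stdint.h>

    // Scalar reference for ARGBCopyAlphaRow: copy only the A byte per pixel.
    static void ARGBCopyAlphaRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                          int width) {
      for (int i = 0; i < width; ++i) {
        dst[4 * i + 3] = src[4 * i + 3];  // dst BGR (bytes 0..2) unchanged
      }
    }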
 
 #ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void SetRow_X86(uint8* dst, uint32 v32, int count) {
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be a multiple of 4.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
   __asm {
+    movzx      eax, byte ptr [esp + 8]    // v8
+    mov        edx, 0x01010101  // Duplicate byte to all bytes.
+    mul        edx              // overwrites edx with upper part of result.
     mov        edx, edi
     mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
     rep stosd
@@ -3876,40 +3548,37 @@
   }
 }
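SetRow_X86 takes an 8-bit value but fills memory with rep stosd, so it first replicates the byte into all four lanes of eax by multiplying by 0x01010101. A scalar sketch of that splat-and-store idea (illustrative, hypothetical name; assumes count is a multiple of 4, as the comment above requires):

    #include <stdint.h>
    #include <string.h>

    // Scalar reference for SetRow_X86: splat v8 to 32 bits, store dwords.
    static void SetRow_C_sketch(uint8_t* dst, uint8_t v8, int count) {
      uint32_t v32 = (uint32_t)v8 * 0x01010101u;  // duplicate byte to 4 lanes
      for (int i = 0; i < count / 4; ++i) {
        memcpy(dst + 4 * i, &v32, 4);             // one 'stosd' worth
      }
    }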
 
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-__declspec(naked) __declspec(align(16))
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
-                   int dst_stride, int height) {
+// Write 'count' bytes using an 8 bit value repeated.
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   __asm {
-    push       esi
-    push       edi
-    push       ebp
-    mov        edi, [esp + 12 + 4]   // dst
-    mov        eax, [esp + 12 + 8]   // v32
-    mov        ebp, [esp + 12 + 12]  // width
-    mov        edx, [esp + 12 + 16]  // dst_stride
-    mov        esi, [esp + 12 + 20]  // height
-    lea        ecx, [ebp * 4]
-    sub        edx, ecx             // stride - width * 4
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v8
+    mov        ecx, [esp + 12]  // count
+    rep stosb
+    mov        edi, edx
+    ret
+  }
+}
 
-    align      4
-  convertloop:
-    mov        ecx, ebp
+// Write 'count' 32 bit values.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
     rep stosd
-    add        edi, edx
-    sub        esi, 1
-    jg         convertloop
-
-    pop        ebp
-    pop        edi
-    pop        esi
+    mov        edi, edx
     ret
   }
 }
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
@@ -3919,7 +3588,6 @@
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -3928,16 +3596,16 @@
     vpand      ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1   // mutates.
     vpermq     ymm0, ymm0, 0xd8
-    sub        ecx, 32
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
     vzeroupper
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -3952,7 +3620,6 @@
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -3982,7 +3649,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -3995,7 +3662,6 @@
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -4022,7 +3688,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_AVX2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
@@ -4030,7 +3696,6 @@
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -4039,16 +3704,16 @@
     vpsrlw     ymm1, ymm1, 8
     vpackuswb  ymm0, ymm0, ymm1   // mutates.
     vpermq     ymm0, ymm0, 0xd8
-    sub        ecx, 32
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         convertloop
-    ret
     vzeroupper
+    ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -4063,7 +3728,6 @@
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -4093,7 +3757,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -4106,7 +3770,6 @@
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
@@ -4135,7 +3798,7 @@
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
@@ -4145,23 +3808,22 @@
     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
-    align      4
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     pand       xmm0, xmm5   // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
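In YUY2 the even bytes are Y and the odd bytes are interleaved U/V, so YUY2ToYRow keeps only the even bytes (the 0x00ff00ff mask plus packuswb above), while the UV row functions below extract or average the odd bytes. Scalar sketch of the Y extraction (illustrative, hypothetical name):

    #include <stdint.h>

    // Scalar reference for YUY2ToYRow: keep the even (Y) bytes of a YUY2 row.
    static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                    int pix) {
      for (int i = 0; i < pix; ++i) {
        dst_y[i] = src_yuy2[2 * i];  // layout: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
      }
    }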
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -4176,114 +3838,6 @@
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
-                               uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -4312,9 +3866,9 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_yuy2
@@ -4325,7 +3879,6 @@
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -4349,7 +3902,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
@@ -4357,23 +3910,22 @@
     mov        edx, [esp + 8]    // dst_y
     mov        ecx, [esp + 12]   // pix
 
-    align      4
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     psrlw      xmm0, 8    // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
     ret
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -4388,112 +3940,6 @@
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     xmm1, xmm0
-    pand       xmm0, xmm5  // U
-    packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
-    packuswb   xmm1, xmm1
-    movq       qword ptr [edx], xmm0
-    movq       qword ptr [edx + edi], xmm1
-    lea        edx, [edx + 8]
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        edi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
-                               uint8* dst_y, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // pix
-
-    align      4
-  convertloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         convertloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       esi
-    push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // pix
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    sub        edi, edx
-
-    align      4
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -4522,9 +3968,9 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
-                                   uint8* dst_u, uint8* dst_v, int pix) {
+__declspec(naked)
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push       edi
     mov        eax, [esp + 4 + 4]    // src_uyvy
@@ -4535,7 +3981,6 @@
     psrlw      xmm5, 8
     sub        edi, edx
 
-    align      4
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -4562,7 +4007,7 @@
 
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
   __asm {
@@ -4579,43 +4024,8 @@
     psllw      xmm5, 8
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
-
-    sub        ecx, 1
-    je         convertloop1     // only 1 pixel?
-    jl         convertloop1b
-
-    // 1 pixel loop until destination pointer is aligned.
-  alignloop1:
-    test       edx, 15          // aligned?
-    je         alignloop1b
-    movd       xmm3, [eax]
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    psrlw      xmm3, 8          // alpha
-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
-    pshuflw    xmm3, xmm3, 0F5h
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jge        alignloop1
-
-  alignloop1b:
-    add        ecx, 1 - 4
-    jl         convertloop4b
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
 
     // 4 pixel loop.
   convertloop4:
@@ -4639,9 +4049,9 @@
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jge        convertloop4
 
   convertloop4b:
@@ -4670,9 +4080,9 @@
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        convertloop1
 
   convertloop1b:
@@ -4696,7 +4106,7 @@
 //    pshufb     xmm3, kShuffleAlpha // alpha
 // Blend 8 pixels at a time.
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   __asm {
@@ -4713,75 +4123,11 @@
     psllw      xmm5, 8
     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
     pslld      xmm4, 24
-
-    sub        ecx, 1
-    je         convertloop1     // only 1 pixel?
-    jl         convertloop1b
-
-    // 1 pixel loop until destination pointer is aligned.
-  alignloop1:
-    test       edx, 15          // aligned?
-    je         alignloop1b
-    movd       xmm3, [eax]
-    lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
-    lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
-    jge        alignloop1
-
-  alignloop1b:
-    add        ecx, 1 - 4
-    jl         convertloop4b
-
-    test       eax, 15          // unaligned?
-    jne        convertuloop4
-    test       esi, 15          // unaligned?
-    jne        convertuloop4
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
 
     // 4 pixel loop.
   convertloop4:
-    movdqa     xmm3, [eax]      // src argb
-    lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqa     xmm2, [esi]      // _r_b
-    pshufb     xmm3, kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqa     xmm1, [esi]      // _a_g
-    lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jge        convertloop4
-    jmp        convertloop4b
-
-    // 4 pixel unaligned loop.
-  convertuloop4:
     movdqu     xmm3, [eax]      // src argb
     lea        eax, [eax + 16]
     movdqa     xmm0, xmm3       // src argb
@@ -4800,10 +4146,10 @@
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
-    jge        convertuloop4
+    sub        ecx, 4
+    jge        convertloop4
 
   convertloop4b:
     add        ecx, 4 - 1
@@ -4829,9 +4175,9 @@
     paddusb    xmm0, xmm2       // + src argb
     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
     paddusb    xmm0, xmm1       // + src argb
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        convertloop1
 
   convertloop1b:
@@ -4843,8 +4189,7 @@
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
@@ -4855,19 +4200,18 @@
     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
     psrld      xmm5, 8
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     punpcklbw  xmm0, xmm0       // first 2
     pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
     pshuflw    xmm2, xmm2, 0FFh
     pmulhuw    xmm0, xmm2       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     punpckhbw  xmm1, xmm1       // next 2 pixels
     pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
     pshuflw    xmm2, xmm2, 0FFh
     pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // alphas
+    movdqu     xmm2, [eax]      // alphas
     lea        eax, [eax + 16]
     psrlw      xmm0, 8
     pand       xmm2, xmm4
@@ -4875,9 +4219,9 @@
     packuswb   xmm0, xmm1
     pand       xmm0, xmm5       // keep original alphas
     por        xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
 
     ret
@@ -4894,7 +4238,7 @@
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
@@ -4905,7 +4249,6 @@
     movdqa     xmm4, kShuffleAlpha0
     movdqa     xmm5, kShuffleAlpha1
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]      // read 4 pixels
     pshufb     xmm0, xmm4       // isolate first 2 alphas
@@ -4924,9 +4267,9 @@
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
 
     ret
@@ -4936,24 +4279,20 @@
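ARGBAttenuateRow premultiplies each color channel by the pixel's own alpha; the punpcklbw xmm0,xmm0 / pmulhuw pattern above is a fixed-point approximation of rgb * a / 255 that preserves the original alpha byte. Scalar sketch (illustrative, hypothetical name; written with an exact divide rather than the asm's approximation):

    #include <stdint.h>

    // Scalar reference for ARGBAttenuateRow: premultiply B, G, R by alpha.
    static void ARGBAttenuateRow_C_sketch(const uint8_t* src_argb,
                                          uint8_t* dst_argb, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t a = src_argb[4 * i + 3];
        dst_argb[4 * i + 0] = (uint8_t)(src_argb[4 * i + 0] * a / 255);  // B
        dst_argb[4 * i + 1] = (uint8_t)(src_argb[4 * i + 1] * a / 255);  // G
        dst_argb[4 * i + 2] = (uint8_t)(src_argb[4 * i + 2] * a / 255);  // R
        dst_argb[4 * i + 3] = (uint8_t)a;                                // A
      }
    }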
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const ulvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
 };
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   // src_argb0
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
-    vmovdqa    ymm4, kShuffleAlpha_AVX2
+    vbroadcastf128 ymm4, kShuffleAlpha_AVX2
     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
     vpslld     ymm5, ymm5, 24
 
-    align      4
  convertloop:
     vmovdqu    ymm6, [eax]       // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
@@ -4967,9 +4306,9 @@
     vpsrlw     ymm1, ymm1, 8
     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
     vpor       ymm0, ymm0, ymm6  // copy original alpha
-    sub        ecx, 8
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    sub        ecx, 8
     jg         convertloop
 
     vzeroupper
@@ -4980,8 +4319,7 @@
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
@@ -4991,7 +4329,6 @@
     mov        edx, [esp + 8 + 8]   // dst_argb
     mov        ecx, [esp + 8 + 12]  // width
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
@@ -5017,9 +4354,9 @@
     lea        eax, [eax + 16]
 
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
     pop        edi
     pop        esi
@@ -5030,14 +4367,13 @@
 
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
 };
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
@@ -5045,9 +4381,8 @@
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
-    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
+    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
 
-    align      4
  convertloop:
     vmovdqu    ymm6, [eax]       // read 8 pixels.
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
@@ -5062,9 +4397,9 @@
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
-    sub        ecx, 8
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    sub        ecx, 8
     jg         convertloop
 
     vzeroupper
@@ -5072,7 +4407,7 @@
   }
 }
 #else  // USE_GATHER
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   __asm {
@@ -5081,12 +4416,11 @@
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
-    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
+    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
 
     push       esi
     push       edi
 
-    align      4
  convertloop:
     // replace VPGATHER
     movzx      esi, byte ptr [eax + 3]                 // alpha0
@@ -5124,9 +4458,9 @@
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
-    sub        ecx, 8
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    sub        ecx, 8
     jg         convertloop
 
     pop        edi
@@ -5140,7 +4474,7 @@
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src_argb */
@@ -5149,18 +4483,17 @@
     movdqa     xmm4, kARGBToYJ
     movdqa     xmm5, kAddYJ64
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]  // G
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm0, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm0, xmm1
     paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0   // 8 G bytes
-    movdqa     xmm2, [eax]  // A
-    movdqa     xmm3, [eax + 16]
+    movdqu     xmm2, [eax]  // A
+    movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
     psrld      xmm2, 24
     psrld      xmm3, 24
@@ -5172,10 +4505,10 @@
     movdqa     xmm1, xmm0
     punpcklwd  xmm0, xmm3   // GGGA first 4
     punpckhwd  xmm1, xmm3   // GGGA next 4
-    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
+    sub        ecx, 8
     jg         convertloop
     ret
   }
@@ -5200,7 +4533,7 @@
 };
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   __asm {
     mov        eax, [esp + 4]   /* dst_argb */
@@ -5209,32 +4542,31 @@
     movdqa     xmm3, kARGBToSepiaG
     movdqa     xmm4, kARGBToSepiaR
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm6, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm6, [eax + 16]
     pmaddubsw  xmm0, xmm2
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0   // 8 B values
-    movdqa     xmm5, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // G
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
     packuswb   xmm5, xmm5   // 8 G values
     punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqa     xmm5, [eax]  // R
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm5, [eax]  // R
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
     packuswb   xmm5, xmm5   // 8 R values
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
@@ -5243,10 +4575,10 @@
     movdqa     xmm1, xmm0   // Weave BG, RA together
     punpcklwd  xmm0, xmm5   // BGRA first 4
     punpckhwd  xmm1, xmm5   // BGRA next 4
-    sub        ecx, 8
-    movdqa     [eax], xmm0
-    movdqa     [eax + 16], xmm1
+    movdqu     [eax], xmm0
+    movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
+    sub        ecx, 8
     jg         convertloop
     ret
   }
@@ -5258,7 +4590,7 @@
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                               const int8* matrix_argb, int width) {
   __asm {
@@ -5272,14 +4604,13 @@
     pshufd     xmm5, xmm5, 0xff
     mov        ecx, [esp + 16]  /* width */
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // B
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm0, xmm2
     pmaddubsw  xmm7, xmm2
-    movdqa     xmm6, [eax]  // G
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm6, [eax]  // G
+    movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
     phaddsw    xmm0, xmm7   // B
@@ -5289,13 +4620,13 @@
     packuswb   xmm0, xmm0   // 8 B values
     packuswb   xmm6, xmm6   // 8 G values
     punpcklbw  xmm0, xmm6   // 8 BG values
-    movdqa     xmm1, [eax]  // R
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm1, [eax]  // R
+    movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
     phaddsw    xmm1, xmm7   // R
-    movdqa     xmm6, [eax]  // A
-    movdqa     xmm7, [eax + 16]
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
     phaddsw    xmm6, xmm7   // A
@@ -5307,11 +4638,11 @@
     movdqa     xmm6, xmm0   // Weave BG, RA together
     punpcklwd  xmm0, xmm1   // BGRA first 4
     punpckhwd  xmm6, xmm1   // BGRA next 4
-    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm6
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
     lea        edx, [edx + 32]
+    sub        ecx, 8
     jg         convertloop
     ret
   }
@@ -5320,8 +4651,7 @@
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   __asm {
@@ -5340,25 +4670,24 @@
     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
     pslld      xmm6, 24
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]  // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     punpcklbw  xmm0, xmm5   // first 2 pixels
     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
-    movdqa     xmm1, [eax]  // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     punpckhbw  xmm1, xmm5   // next 2 pixels
     pmulhuw    xmm1, xmm2
     pmullw     xmm0, xmm3   // * interval_size
-    movdqa     xmm7, [eax]  // read 4 pixels
+    movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
     pand       xmm7, xmm6   // mask alpha
     paddw      xmm0, xmm4   // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
-    sub        ecx, 4
-    movdqa     [eax], xmm0
+    movdqu     [eax], xmm0
     lea        eax, [eax + 16]
+    sub        ecx, 4
     jg         convertloop
     ret
   }
@@ -5367,8 +4696,7 @@
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   __asm {
@@ -5379,9 +4707,8 @@
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0       // first 2
@@ -5391,9 +4718,9 @@
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
 
     ret
@@ -5403,7 +4730,7 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
@@ -5414,7 +4741,6 @@
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
     movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
@@ -5429,9 +4755,9 @@
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
 
     pop        esi
@@ -5443,7 +4769,7 @@
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
@@ -5456,16 +4782,15 @@
     sub        ecx, 4
     jl         convertloop49
 
-    align      4
  convertloop4:
     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jge        convertloop4
 
  convertloop49:
@@ -5478,9 +4803,9 @@
     movd       xmm1, [esi]        // read 1 pixels from src_argb1
     lea        esi, [esi + 4]
     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        convertloop1
 
  convertloop19:
@@ -5492,7 +4817,7 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
@@ -5502,16 +4827,15 @@
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
-    align      4
  convertloop:
     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
     psubusb    xmm0, xmm1         // src_argb0 - src_argb1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         convertloop
 
     pop        esi
@@ -5522,7 +4846,7 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
@@ -5533,7 +4857,6 @@
     mov        ecx, [esp + 4 + 16]  // width
     vpxor      ymm5, ymm5, ymm5     // constant 0
 
-    align      4
  convertloop:
     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
@@ -5560,7 +4883,7 @@
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) {
   __asm {
@@ -5570,7 +4893,6 @@
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
-    align      4
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
@@ -5590,7 +4912,7 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
   __asm {
@@ -5600,7 +4922,6 @@
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
-    align      4
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
@@ -5623,7 +4944,7 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
   __asm {
@@ -5639,7 +4960,6 @@
     sub        edx, eax
     pxor       xmm5, xmm5  // constant 0
 
-    align      4
  convertloop:
     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
     movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
@@ -5663,9 +4983,9 @@
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
-    sub        ecx, 8
     movq       qword ptr [eax + edx], xmm0
     lea        eax, [eax + 8]
+    sub        ecx, 8
     jg         convertloop
 
     pop        edi
@@ -5680,7 +5000,7 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
   __asm {
@@ -5693,7 +5013,6 @@
     sub        edx, eax
     pxor       xmm5, xmm5  // constant 0
 
-    align      4
  convertloop:
     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
@@ -5717,9 +5036,9 @@
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
-    sub        ecx, 8
     movq       qword ptr [eax + edx], xmm0
     lea        eax, [eax + 8]
+    sub        ecx, 8
     jg         convertloop
 
     pop        esi
@@ -5734,7 +5053,7 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) {
   __asm {
@@ -5747,10 +5066,9 @@
     pcmpeqb    xmm5, xmm5           // alpha 255
     pslld      xmm5, 24             // 0xff000000
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
     movdqa     xmm2, xmm0             // GG
@@ -5766,12 +5084,12 @@
     punpckhwd  xmm0, xmm0             // Last 4
     por        xmm3, xmm5             // GGGA
     por        xmm0, xmm5
-    sub        ecx, 16
-    movdqa     [edx], xmm1
-    movdqa     [edx + 16], xmm2
-    movdqa     [edx + 32], xmm3
-    movdqa     [edx + 48], xmm0
+    movdqu     [edx], xmm1
+    movdqu     [edx + 16], xmm2
+    movdqu     [edx + 32], xmm3
+    movdqu     [edx + 48], xmm0
     lea        edx, [edx + 64]
+    sub        ecx, 16
     jg         convertloop
 
     pop        esi
@@ -5782,7 +5100,7 @@
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_y, int width) {
   __asm {
@@ -5793,15 +5111,14 @@
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         convertloop
 
     pop        esi
@@ -5816,7 +5133,7 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   __asm {
@@ -5828,10 +5145,9 @@
     sub        esi, eax
     pcmpeqb    xmm5, xmm5           // alpha 255
 
-    align      4
  convertloop:
-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
     paddusb    xmm2, xmm1             // sobel = sobelx + sobely
@@ -5847,12 +5163,12 @@
     movdqa     xmm7, xmm1             // YSXA
     punpcklwd  xmm7, xmm0             // Next 4
     punpckhwd  xmm1, xmm0             // Last 4
-    sub        ecx, 16
-    movdqa     [edx], xmm6
-    movdqa     [edx + 16], xmm4
-    movdqa     [edx + 32], xmm7
-    movdqa     [edx + 48], xmm1
+    movdqu     [edx], xmm6
+    movdqu     [edx + 16], xmm4
+    movdqu     [edx + 32], xmm7
+    movdqu     [edx + 48], xmm1
     lea        edx, [edx + 64]
+    sub        ecx, 16
     jg         convertloop
 
     pop        esi
@@ -5873,8 +5189,7 @@
 // area is the number of pixels in the area being averaged.
 // dst points to pixel to store result to.
 // count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
+// Does 4 pixels at a time.
 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                     int width, int area, uint8* dst,
                                     int count) {
@@ -5904,13 +5219,12 @@
     packssdw   xmm5, xmm5           // 16 bit shorts
 
     // 4 pixel loop small blocks.
-    align      4
   s4:
     // top left
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
 
     // - top right
     psubd      xmm0, [eax + edx * 4]
@@ -5947,13 +5261,12 @@
     jmp        l4b
 
     // 4 pixel loop
-    align      4
   l4:
     // top left
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + 32]
-    movdqa     xmm3, [eax + 48]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
 
     // - top right
     psubd      xmm0, [eax + edx * 4]
@@ -6000,9 +5313,8 @@
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
     lea        eax, [eax + 16]
     psubd      xmm0, [esi]
@@ -6041,7 +5353,6 @@
     jne        l4b
 
     // 4 pixel loop
-    align      4
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -6058,26 +5369,26 @@
     punpckhwd  xmm5, xmm1
 
     paddd      xmm0, xmm2
-    movdqa     xmm2, [esi]  // previous row above.
+    movdqu     xmm2, [esi]  // previous row above.
     paddd      xmm2, xmm0
 
     paddd      xmm0, xmm3
-    movdqa     xmm3, [esi + 16]
+    movdqu     xmm3, [esi + 16]
     paddd      xmm3, xmm0
 
     paddd      xmm0, xmm4
-    movdqa     xmm4, [esi + 32]
+    movdqu     xmm4, [esi + 32]
     paddd      xmm4, xmm0
 
     paddd      xmm0, xmm5
-    movdqa     xmm5, [esi + 48]
+    movdqu     xmm5, [esi + 48]
     lea        esi, [esi + 64]
     paddd      xmm5, xmm0
 
-    movdqa     [edx], xmm2
-    movdqa     [edx + 16], xmm3
-    movdqa     [edx + 32], xmm4
-    movdqa     [edx + 48], xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    movdqu     [edx + 32], xmm4
+    movdqu     [edx + 48], xmm5
 
     lea        edx, [edx + 64]
     sub        ecx, 4
@@ -6088,7 +5399,6 @@
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
     lea        eax, [eax + 4]
@@ -6110,7 +5420,7 @@
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
@@ -6143,7 +5453,6 @@
     addps      xmm4, xmm4    // dudv *= 4
 
     // 4 pixel loop
-    align      4
   l4:
     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
@@ -6165,9 +5474,9 @@
     movd       xmm0, [eax + edi]  // read pixel 3
     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
     addps      xmm3, xmm4    // x, y += dx, dy next 2
-    sub        ecx, 4
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jge        l4
 
   l4b:
@@ -6175,7 +5484,6 @@
     jl         l1b
 
     // 1 pixel loop
-    align      4
   l1:
     cvttps2dq  xmm0, xmm2    // x, y float to int
     packssdw   xmm0, xmm0    // x, y as shorts
@@ -6183,9 +5491,9 @@
     addps      xmm2, xmm7    // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
-    sub        ecx, 1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
+    sub        ecx, 1
     jge        l1
   l1b:
     pop        edi
@@ -6196,11 +5504,11 @@
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
 #ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+// Bilinear filter 32x2 -> 32x1
+__declspec(naked)
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
   __asm {
     push       esi
     push       edi
@@ -6230,7 +5538,6 @@
     vpxor      ymm0, ymm0, ymm0
     vpermd     ymm5, ymm0, ymm5
 
-    align      4
   xloop:
     vmovdqu    ymm0, [esi]
     vmovdqu    ymm2, [esi + edx]
@@ -6241,51 +5548,49 @@
     vpsrlw     ymm0, ymm0, 7
     vpsrlw     ymm1, ymm1, 7
     vpackuswb  ymm0, ymm0, ymm1  // unmutates
-    sub        ecx, 32
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
+    sub        ecx, 32
     jg         xloop
     jmp        xloop99
 
-    // Blend 25 / 75.
-    align      4
-  xloop25:
-    vmovdqu    ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi + edx]
-    vpavgb     ymm0, ymm0, [esi + edx]
-    sub        ecx, 32
-    vmovdqu    [esi + edi], ymm0
-    lea        esi, [esi + 32]
-    jg         xloop25
-    jmp        xloop99
+   // Blend 25 / 75.
+ xloop25:
+   vmovdqu    ymm0, [esi]
+   vmovdqu    ymm1, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1
+   vpavgb     ymm0, ymm0, ymm1
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop25
+   jmp        xloop99
 
-    // Blend 50 / 50.
-    align      4
-  xloop50:
-    vmovdqu    ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi + edx]
-    sub        ecx, 32
-    vmovdqu    [esi + edi], ymm0
-    lea        esi, [esi + 32]
-    jg         xloop50
-    jmp        xloop99
+   // Blend 50 / 50.
+ xloop50:
+   vmovdqu    ymm0, [esi]
+   vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop50
+   jmp        xloop99
 
-    // Blend 75 / 25.
-    align      4
-  xloop75:
-    vmovdqu    ymm0, [esi + edx]
-    vpavgb     ymm0, ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi]
-    sub        ecx, 32
-    vmovdqu     [esi + edi], ymm0
-    lea        esi, [esi + 32]
-    jg         xloop75
-    jmp        xloop99
+   // Blend 75 / 25.
+ xloop75:
+   vmovdqu    ymm1, [esi]
+   vmovdqu    ymm0, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1
+   vpavgb     ymm0, ymm0, ymm1
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop75
+   jmp        xloop99
 
-    // Blend 100 / 0 - Copy row unchanged.
-    align      4
-  xloop100:
-    rep movsb
+   // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+   rep movsb
 
   xloop99:
     pop        edi
@@ -6296,9 +5601,8 @@
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
-#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
                           int source_y_fraction) {
@@ -6330,11 +5634,10 @@
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
 
-    align      4
   xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    movdqa     xmm1, xmm0
+    movdqu     xmm0, [esi]
+    movdqu     xmm2, [esi + edx]
+    movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
     pmaddubsw  xmm0, xmm5
@@ -6342,57 +5645,53 @@
     psrlw      xmm0, 7
     psrlw      xmm1, 7
     packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
+    movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop
     jmp        xloop99
 
     // Blend 25 / 75.
-    align      4
   xloop25:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
+    movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop25
     jmp        xloop99
 
     // Blend 50 / 50.
-    align      4
   xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
+    movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop50
     jmp        xloop99
 
     // Blend 75 / 25.
-    align      4
   xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
+    movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop75
     jmp        xloop99
 
     // Blend 100 / 0 - Copy row unchanged.
-    align      4
   xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
+    movdqu     xmm0, [esi]
+    movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop100
 
   xloop99:
@@ -6401,11 +5700,10 @@
     ret
   }
 }
-#endif  // HAS_INTERPOLATEROW_SSSE3
 
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
@@ -6436,225 +5734,6 @@
     punpcklqdq xmm5, xmm5
     pxor       xmm4, xmm4
 
-    align      4
-  xloop:
-    movdqa     xmm0, [esi]  // row0
-    movdqa     xmm2, [esi + edx]  // row1
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    psubw      xmm2, xmm0  // row1 - row0
-    psubw      xmm3, xmm1
-    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
-    paddw      xmm3, xmm3
-    pmulhw     xmm2, xmm5  // scale diff
-    pmulhw     xmm3, xmm5
-    paddw      xmm0, xmm2  // sum rows
-    paddw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 25 / 75.
-    align      4
-  xloop25:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
-    align      4
-  xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 75 / 25.
-    align      4
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      4
-  xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-#endif  // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    ptrdiff_t src_stride, int dst_width,
-                                    int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    shr        eax, 1
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 128.  Blend 100 / 0.
-    cmp        eax, 32
-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
-    cmp        eax, 64
-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
-    cmp        eax, 96
-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
-
-    movd       xmm0, eax  // high fraction 0..127
-    neg        eax
-    add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
-    punpcklbw  xmm5, xmm0
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-
-    align      4
-  xloop:
-    movdqu     xmm0, [esi]
-    movdqu     xmm2, [esi + edx]
-    movdqu     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop
-    jmp        xloop99
-
-    // Blend 25 / 75.
-    align      4
-  xloop25:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop25
-    jmp        xloop99
-
-    // Blend 50 / 50.
-    align      4
-  xloop50:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-    jmp        xloop99
-
-    // Blend 75 / 25.
-    align      4
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-    jmp        xloop99
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      4
-  xloop100:
-    movdqu     xmm0, [esi]
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-  xloop99:
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                   ptrdiff_t src_stride, int dst_width,
-                                   int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    sub        edi, esi
-    // Dispatch to specialized filters if applicable.
-    cmp        eax, 0
-    je         xloop100  // 0 / 256.  Blend 100 / 0.
-    cmp        eax, 64
-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
-    cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
-    cmp        eax, 192
-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
-
-    movd       xmm5, eax            // xmm5 = y fraction
-    punpcklbw  xmm5, xmm5
-    psrlw      xmm5, 1
-    punpcklwd  xmm5, xmm5
-    punpckldq  xmm5, xmm5
-    punpcklqdq xmm5, xmm5
-    pxor       xmm4, xmm4
-
-    align      4
   xloop:
     movdqu     xmm0, [esi]  // row0
     movdqu     xmm2, [esi + edx]  // row1
@@ -6673,57 +5752,53 @@
     paddw      xmm0, xmm2  // sum rows
     paddw      xmm1, xmm3
     packuswb   xmm0, xmm1
-    sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop
     jmp        xloop99
 
     // Blend 25 / 75.
-    align      4
   xloop25:
     movdqu     xmm0, [esi]
     movdqu     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
-    sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop25
     jmp        xloop99
 
     // Blend 50 / 50.
-    align      4
   xloop50:
     movdqu     xmm0, [esi]
     movdqu     xmm1, [esi + edx]
     pavgb      xmm0, xmm1
-    sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop50
     jmp        xloop99
 
     // Blend 75 / 25.
-    align      4
   xloop75:
     movdqu     xmm1, [esi]
     movdqu     xmm0, [esi + edx]
     pavgb      xmm0, xmm1
     pavgb      xmm0, xmm1
-    sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop75
     jmp        xloop99
 
     // Blend 100 / 0 - Copy row unchanged.
-    align      4
   xloop100:
     movdqu     xmm0, [esi]
-    sub        ecx, 16
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
+    sub        ecx, 16
     jg         xloop100
 
   xloop99:
@@ -6734,170 +5809,34 @@
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 
-__declspec(naked) __declspec(align(16))
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // src_uv_stride
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    sub        edi, eax
-
-    align      4
-  convertloop:
-    movdqa     xmm0, [eax]
-    pavgb      xmm0, [eax + edx]
-    sub        ecx, 16
-    movdqa     [eax + edi], xmm0
-    lea        eax,  [eax + 16]
-    jg         convertloop
-    pop        edi
-    ret
-  }
-}
-
-#ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
-                  uint8* dst_uv, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // src_uv_stride
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    sub        edi, eax
-
-    align      4
-  convertloop:
-    vmovdqu    ymm0, [eax]
-    vpavgb     ymm0, ymm0, [eax + edx]
-    sub        ecx, 32
-    vmovdqu    [eax + edi], ymm0
-    lea        eax,  [eax + 32]
-    jg         convertloop
-
-    pop        edi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_HALFROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                          uint32 selector, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_bayer
-    movd       xmm5, [esp + 12]  // selector
-    mov        ecx, [esp + 16]   // pix
-    pshufd     xmm5, xmm5, 0
-
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm5
-    pshufb     xmm1, xmm5
-    punpckldq  xmm0, xmm1
-    sub        ecx, 8
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    jg         wloop
-    ret
-  }
-}
-
-// Specialized ARGB to Bayer that just isolates G channel.
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 selector, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_bayer
-                                 // selector
-    mov        ecx, [esp + 16]   // pix
-    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
-    psrld      xmm5, 24
-
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    psrld      xmm0, 8  // Move green to bottom.
-    psrld      xmm1, 8
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packssdw   xmm0, xmm1
-    packuswb   xmm0, xmm1
-    sub        ecx, 8
-    movq       qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
-    jg         wloop
-    ret
-  }
-}
-
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
     mov        edx, [esp + 8]    // dst_argb
     mov        ecx, [esp + 12]   // shuffler
-    movdqa     xmm5, [ecx]
+    movdqu     xmm5, [ecx]
     mov        ecx, [esp + 16]   // pix
 
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm5
-    pshufb     xmm1, xmm5
-    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
-    lea        edx, [edx + 32]
-    jg         wloop
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                    const uint8* shuffler, int pix) {
-  __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
-    movdqa     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // pix
-
-    align      4
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm5
     pshufb     xmm1, xmm5
-    sub        ecx, 8
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
+    sub        ecx, 8
     jg         wloop
     ret
   }
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
@@ -6907,17 +5846,16 @@
     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
     mov        ecx, [esp + 16]    // pix
 
-    align      4
   wloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax, [eax + 64]
     vpshufb    ymm0, ymm0, ymm5
     vpshufb    ymm1, ymm1, ymm5
-    sub        ecx, 16
     vmovdqu    [edx], ymm0
     vmovdqu    [edx + 32], ymm1
     lea        edx, [edx + 64]
+    sub        ecx, 16
     jg         wloop
 
     vzeroupper
@@ -6926,7 +5864,7 @@
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   __asm {
@@ -6968,7 +5906,6 @@
     jg         shuf_any1
     jmp        shuf99
 
-    align      4
   shuf_0123:
     movdqu     xmm0, [eax]
     lea        eax, [eax + 16]
@@ -6980,13 +5917,12 @@
     pshufhw    xmm1, xmm1, 01Bh
     pshuflw    xmm1, xmm1, 01Bh
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         shuf_0123
     jmp        shuf99
 
-    align      4
   shuf_0321:
     movdqu     xmm0, [eax]
     lea        eax, [eax + 16]
@@ -6998,13 +5934,12 @@
     pshufhw    xmm1, xmm1, 039h
     pshuflw    xmm1, xmm1, 039h
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         shuf_0321
     jmp        shuf99
 
-    align      4
   shuf_2103:
     movdqu     xmm0, [eax]
     lea        eax, [eax + 16]
@@ -7016,13 +5951,12 @@
     pshufhw    xmm1, xmm1, 093h
     pshuflw    xmm1, xmm1, 093h
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         shuf_2103
     jmp        shuf99
 
-    align      4
   shuf_3012:
     movdqu     xmm0, [eax]
     lea        eax, [eax + 16]
@@ -7034,9 +5968,9 @@
     pshufhw    xmm1, xmm1, 0C6h
     pshuflw    xmm1, xmm1, 0C6h
     packuswb   xmm0, xmm1
-    sub        ecx, 4
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         shuf_3012
 
   shuf99:
@@ -7052,7 +5986,7 @@
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -7067,7 +6001,6 @@
     mov        ecx, [esp + 8 + 20]   // width
     sub        edx, esi
 
-    align      4
   convertloop:
     movq       xmm2, qword ptr [esi] // U
     movq       xmm3, qword ptr [esi + edx] // V
@@ -7090,7 +6023,7 @@
   }
 }
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -7105,7 +6038,6 @@
     mov        ecx, [esp + 8 + 20]   // width
     sub        edx, esi
 
-    align      4
   convertloop:
     movq       xmm2, qword ptr [esi] // U
     movq       xmm3, qword ptr [esi + edx] // V
@@ -7129,7 +6061,7 @@
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
@@ -7142,7 +6074,6 @@
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
     // 2 pixel loop.
-    align      4
  convertloop:
 //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
 //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
@@ -7178,9 +6109,9 @@
     cvttps2dq  xmm4, xmm4
     packuswb   xmm0, xmm4
     packuswb   xmm0, xmm0
-    sub        ecx, 2
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
+    sub        ecx, 2
     jg         convertloop
     pop        esi
     ret
@@ -7189,7 +6120,7 @@
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
@@ -7204,7 +6135,6 @@
     mov        ecx, [esp + 16]  /* width */
 
     // 2 pixel loop.
-    align      4
  convertloop:
     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea         eax, [eax + 8]
@@ -7218,9 +6148,9 @@
     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
-    sub         ecx, 2
     vmovq       qword ptr [edx], xmm0
     lea         edx, [edx + 8]
+    sub         ecx, 2
     jg          convertloop
     vzeroupper
     ret
@@ -7230,7 +6160,7 @@
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                            int width) {
   __asm {
@@ -7240,7 +6170,6 @@
     mov        ecx, [esp + 4 + 12]  /* width */
 
     // 1 pixel loop.
-    align      4
   convertloop:
     movzx      edx, byte ptr [eax]
     lea        eax, [eax + 4]
@@ -7265,7 +6194,7 @@
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Transform RGB pixels with color table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
   __asm {
     push       esi
@@ -7274,7 +6203,6 @@
     mov        ecx, [esp + 4 + 12]  /* width */
 
     // 1 pixel loop.
-    align      4
   convertloop:
     movzx      edx, byte ptr [eax]
     lea        eax, [eax + 4]
@@ -7297,7 +6225,7 @@
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Transform RGB pixels with luma table.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                  int width,
                                  const uint8* luma, uint32 lumacoeff) {
@@ -7316,7 +6244,6 @@
     pxor       xmm5, xmm5
 
     // 4 pixel loop.
-    align      4
   convertloop:
     movdqu     xmm0, qword ptr [eax]      // generate luma ptr
     pmaddubsw  xmm0, xmm3
@@ -7383,9 +6310,9 @@
     movzx      edx, byte ptr [eax + 15]  // copy alpha.
     mov        byte ptr [edi + 15], dl
 
-    sub        ecx, 4
     lea        eax, [eax + 16]
     lea        edi, [edi + 16]
+    sub        ecx, 4
     jg         convertloop
 
     pop        edi
@@ -7396,7 +6323,7 @@
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
 #endif  // defined(_M_X64)
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/scale.cc b/libvpx/third_party/libyuv/source/scale.cc
index 5b33b5f..0a01304 100644
--- a/libvpx/third_party/libyuv/source/scale.cc
+++ b/libvpx/third_party/libyuv/source/scale.cc
@@ -23,9 +23,6 @@
 extern "C" {
 #endif
 
-// Remove this macro if OVERREAD is safe.
-#define AVOID_OVERREAD 1
-
 static __inline int Abs(int v) {
   return v >= 0 ? v : -v;
 }
@@ -44,9 +41,8 @@
   int y;
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
-        ScaleRowDown2Box_C);
+      filtering == kFilterNone ? ScaleRowDown2_C :
+      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
@@ -54,23 +50,42 @@
   }
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+        ScaleRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+          ScaleRowDown2Box_NEON);
+    }
   }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
-        ScaleRowDown2Box_Unaligned_SSE2);
-    if (IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
+        ScaleRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 16)) {
       ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
           (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
           ScaleRowDown2Box_SSE2);
     }
   }
-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+        ScaleRowDown2Box_Any_AVX2);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+          ScaleRowDown2Box_AVX2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -112,21 +127,15 @@
     ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
         ScaleRowDown2_16_NEON;
   }
-#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ?
-        ScaleRowDown2_Unaligned_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
-        ScaleRowDown2Box_Unaligned_16_SSE2);
-    if (IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-          ScaleRowDown2Box_16_SSE2);
-    }
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+        ScaleRowDown2Box_16_SSE2);
   }
-#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -165,16 +174,33 @@
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+    }
   }
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+    }
   }
-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -212,14 +238,14 @@
     ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
         ScaleRowDown4_16_NEON;
   }
-#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
     ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
         ScaleRowDown4_16_SSE2;
   }
-#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -260,25 +286,42 @@
     ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
   }
 #if defined(HAS_SCALEROWDOWN34_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
     } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+      }
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
     } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+      }
     }
   }
 #endif
@@ -351,8 +394,7 @@
   }
 #endif
 #if defined(HAS_SCALEROWDOWN34_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
     if (!filtering) {
       ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
       ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@@ -435,28 +477,47 @@
     ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
     ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
   }
+
 #if defined(HAS_SCALEROWDOWN38_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_NEON;
+      ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_NEON;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+      }
     }
   }
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+    }
+    if (dst_width % 12 == 0 && !filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
-    } else {
+    }
+    if (dst_width % 6 == 0 && filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
       ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
     }
   }
-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -522,9 +583,9 @@
       ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
     }
   }
-#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
     if (!filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
       ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@@ -533,7 +594,8 @@
       ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
     }
   }
-#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -570,65 +632,7 @@
   }
 }
 
-static __inline uint32 SumBox(int iboxwidth, int iboxheight,
-                              ptrdiff_t src_stride, const uint8* src_ptr) {
-  uint32 sum = 0u;
-  int y;
-  assert(iboxwidth > 0);
-  assert(iboxheight > 0);
-  for (y = 0; y < iboxheight; ++y) {
-    int x;
-    for (x = 0; x < iboxwidth; ++x) {
-      sum += src_ptr[x];
-    }
-    src_ptr += src_stride;
-  }
-  return sum;
-}
-
-static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
-                                 ptrdiff_t src_stride, const uint16* src_ptr) {
-  uint32 sum = 0u;
-  int y;
-  assert(iboxwidth > 0);
-  assert(iboxheight > 0);
-  for (y = 0; y < iboxheight; ++y) {
-    int x;
-    for (x = 0; x < iboxwidth; ++x) {
-      sum += src_ptr[x];
-    }
-    src_ptr += src_stride;
-  }
-  return sum;
-}
-
-static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
-                               int x, int dx, ptrdiff_t src_stride,
-                               const uint8* src_ptr, uint8* dst_ptr) {
-  int i;
-  int boxwidth;
-  for (i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    x += dx;
-    boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
-        (boxwidth * boxheight);
-  }
-}
-
-static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
-                                  int x, int dx, ptrdiff_t src_stride,
-                                  const uint16* src_ptr, uint16* dst_ptr) {
-  int i;
-  int boxwidth;
-  for (i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    x += dx;
-    boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
-        (boxwidth * boxheight);
-  }
-}
+#define MIN1(x) ((x) < 1 ? 1 : (x))
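Note: MIN1 clamps a box dimension to at least 1 so the 65536 / (boxwidth * boxheight) reciprocal
tables below never divide by zero. That can now occur because the old ScalePlaneBoxRow_C fallback
for dst_height * 2 > src_height is removed later in this file, so the box path also reaches mildly
upscaled dimensions where dx or dy is below one source pixel per output. A worked example of the
clamp, using the same 16.16 fixed-point convention:

  #include <assert.h>
  #define MIN1(x) ((x) < 1 ? 1 : (x))

  int main(void) {
    int dx = 0x8000;              /* 0.5 source pixels per output pixel (mild upscale) */
    int minboxwidth = dx >> 16;   /* 0 -- would divide by zero without the clamp       */
    int boxheight = 2;
    int scale = 65536 / (MIN1(minboxwidth) * boxheight);   /* 65536 / (1 * 2) */
    assert(scale == 32768);
    return 0;
  }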
 
 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   uint32 sum = 0u;
@@ -654,15 +658,15 @@
                             const uint16* src_ptr, uint8* dst_ptr) {
   int i;
   int scaletbl[2];
-  int minboxwidth = (dx >> 16);
+  int minboxwidth = dx >> 16;
   int* scaleptr = scaletbl - minboxwidth;
   int boxwidth;
-  scaletbl[0] = 65536 / (minboxwidth * boxheight);
-  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
   for (i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
-    boxwidth = (x >> 16) - ix;
+    boxwidth = MIN1((x >> 16) - ix);
     *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
   }
 }
@@ -671,25 +675,36 @@
                                const uint32* src_ptr, uint16* dst_ptr) {
   int i;
   int scaletbl[2];
-  int minboxwidth = (dx >> 16);
+  int minboxwidth = dx >> 16;
   int* scaleptr = scaletbl - minboxwidth;
   int boxwidth;
-  scaletbl[0] = 65536 / (minboxwidth * boxheight);
-  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
   for (i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
-    boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
-        scaleptr[boxwidth] >> 16;
+    boxwidth = MIN1((x >> 16) - ix);
+    *dst_ptr++ =
+        SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int scaleval = 65536 / boxheight;
+  int i;
+  src_ptr += (x >> 16);
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = src_ptr[i] * scaleval >> 16;
   }
 }
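Note: ScaleAddCols0_C is a new fast path selected in ScalePlaneBox below when dx is exactly
0x10000 (one source pixel per output pixel). Each output is then a single accumulated column
scaled by 65536 / boxheight, so the SumPixels loop is skipped. A hedged sketch of the three-way
selection and the averaging arithmetic (the enum names are illustrative only):

  #include <assert.h>
  #include <stdint.h>

  enum ColFn { COLS2_FRACTIONAL, COLS1_INTEGER, COLS0_UNSCALED };

  /* Mirrors the (dx & 0xffff) ? Cols2 : (dx != 0x10000 ? Cols1 : Cols0) selection. */
  static enum ColFn PickCols(int dx) {
    if (dx & 0xffff) return COLS2_FRACTIONAL;   /* fractional step: per-pixel box width */
    if (dx != 0x10000) return COLS1_INTEGER;    /* integer step > 1: fixed box width    */
    return COLS0_UNSCALED;                      /* step == 1: column sum / boxheight    */
  }

  int main(void) {
    assert(PickCols(0x10000) == COLS0_UNSCALED);
    assert(PickCols(0x20000) == COLS1_INTEGER);
    assert(PickCols(0x18000) == COLS2_FRACTIONAL);
    /* COLS0 math: a column accumulated over boxheight rows, then averaged. */
    uint16_t column_sum = 3 * 200;                        /* boxheight = 3, samples = 200 */
    assert((column_sum * (65536 / 3)) >> 16 == 199);      /* truncates, as in the C code  */
    return 0;
  }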
 
 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
                             const uint16* src_ptr, uint8* dst_ptr) {
-  int boxwidth = (dx >> 16);
+  int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
+  x >>= 16;
   for (i = 0; i < dst_width; ++i) {
     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
     x += boxwidth;
@@ -698,7 +713,7 @@
 
 static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
                                const uint32* src_ptr, uint16* dst_ptr) {
-  int boxwidth = (dx >> 16);
+  int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
   for (i = 0; i < dst_width; ++i) {
@@ -718,7 +733,7 @@
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -728,10 +743,40 @@
   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
              &x, &y, &dx, &dy);
   src_width = Abs(src_width);
-  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
-  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
-    uint8* dst = dst_ptr;
-    int j;
+  {
+    // Allocate a row buffer of uint16.
+    align_buffer_64(row16, src_width * 2);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C:
+        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+        ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      ScaleAddRow = ScaleAddRow_Any_SSE2;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_SSE2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      ScaleAddRow = ScaleAddRow_Any_AVX2;
+      if (IS_ALIGNED(src_width, 32)) {
+        ScaleAddRow = ScaleAddRow_AVX2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      ScaleAddRow = ScaleAddRow_Any_NEON;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_NEON;
+      }
+    }
+#endif
+
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
@@ -740,46 +785,13 @@
       if (y > max_y) {
         y = max_y;
       }
-      boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow_C(dst_width, boxheight,
-                         x, dx, src_stride,
-                         src, dst);
-      dst += dst_stride;
-    }
-    return;
-  }
-  {
-    // Allocate a row buffer of uint16.
-    align_buffer_64(row16, src_width * 2);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
-    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
-        uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
-
-#if defined(HAS_SCALEADDROWS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-#ifdef AVOID_OVERREAD
-        IS_ALIGNED(src_width, 16) &&
-#endif
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-      ScaleAddRows = ScaleAddRows_SSE2;
-    }
-#endif
-
-    for (j = 0; j < dst_height; ++j) {
-      int boxheight;
-      int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
-      y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
+      boxheight = MIN1((y >> 16) - iy);
+      memset(row16, 0, src_width * 2);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        src += src_stride;
       }
-      boxheight = (y >> 16) - iy;
-      ScaleAddRows(src, src_stride, (uint16*)(row16),
-                 src_width, boxheight);
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
-                 dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row16);
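Note on accumulator width: the 8-bit plane path accumulates into a uint16 row (row16) and the
16-bit path below into uint32 (row32). With 8-bit samples a uint16 sum stays exact for box heights
up to 257 rows; the removed ScaleAddRows_C clamped its sum at 65535 instead of relying on that
headroom. A quick arithmetic check:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    /* 65535 / 255 == 257: the tallest box that cannot wrap a uint16 accumulator. */
    assert(65535 / 255 == 257);
    uint16_t acc = 0;
    for (int k = 0; k < 257; ++k) acc += 255;   /* worst case: all-white rows */
    assert(acc == 257 * 255);                   /* 65535, still representable */
    return 0;
  }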
@@ -790,7 +802,7 @@
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint16* src_ptr, uint16* dst_ptr) {
-  int j;
+  int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
@@ -800,10 +812,21 @@
   ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
              &x, &y, &dx, &dy);
   src_width = Abs(src_width);
-  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
-  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
-    uint16* dst = dst_ptr;
-    int j;
+  {
+    // Allocate a row buffer of uint32.
+    align_buffer_64(row32, src_width * 4);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+        ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+      ScaleAddRow = ScaleAddRow_16_SSE2;
+    }
+#endif
+
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
@@ -812,46 +835,13 @@
       if (y > max_y) {
         y = max_y;
       }
-      boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow_16_C(dst_width, boxheight,
-                            x, dx, src_stride,
-                            src, dst);
-      dst += dst_stride;
-    }
-    return;
-  }
-  {
-    // Allocate a row buffer of uint32.
-    align_buffer_64(row32, src_width * 4);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-    void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
-        uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
-
-#if defined(HAS_SCALEADDROWS_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-#ifdef AVOID_OVERREAD
-        IS_ALIGNED(src_width, 16) &&
-#endif
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-      ScaleAddRows = ScaleAddRows_16_SSE2;
-    }
-#endif
-
-    for (j = 0; j < dst_height; ++j) {
-      int boxheight;
-      int iy = y >> 16;
-      const uint16* src = src_ptr + iy * src_stride;
-      y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
+      boxheight = MIN1((y >> 16) - iy);
+      memset(row32, 0, src_width * 4);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        src += src_stride;
       }
-      boxheight = (y >> 16) - iy;
-      ScaleAddRows(src, src_stride, (uint32*)(row32),
-                 src_width, boxheight);
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
-                 dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row32);
@@ -886,29 +876,23 @@
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(src_width, 32)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -916,7 +900,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(src_width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
@@ -924,7 +908,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
     InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
     if (IS_ALIGNED(src_width, 4)) {
       InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -938,6 +922,14 @@
     ScaleFilterCols = ScaleFilterCols_SSSE3;
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
   if (y > max_y) {
     y = max_y;
   }
@@ -988,29 +980,23 @@
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_16_AVX2;
     if (IS_ALIGNED(src_width, 32)) {
       InterpolateRow = InterpolateRow_16_AVX2;
@@ -1018,7 +1004,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_16_NEON;
     if (IS_ALIGNED(src_width, 16)) {
       InterpolateRow = InterpolateRow_16_NEON;
@@ -1026,7 +1012,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
     InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
     if (IS_ALIGNED(src_width, 4)) {
       InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1080,36 +1066,30 @@
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
       InterpolateRow_C;
   void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-       int dst_width, int x, int dx) =
-       filtering ? ScaleFilterCols_C : ScaleCols_C;
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_C : ScaleCols_C;
   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
              &x, &y, &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(dst_width, 32)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -1117,7 +1097,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
@@ -1125,7 +1105,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
     InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -1141,12 +1121,18 @@
     ScaleFilterCols = ScaleFilterCols_SSSE3;
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
 #if defined(HAS_SCALECOLS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleFilterCols = ScaleColsUp2_SSE2;
     }
 #endif
@@ -1160,7 +1146,7 @@
     const uint8* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
-    const int kRowSize = (dst_width + 15) & ~15;
+    const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
     uint8* rowptr = row;
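Note: the two interleaved row buffers are now rounded up to a multiple of 32 bytes rather than 16,
most likely so the 32-byte AVX2 row kernels introduced in this change keep each buffer starting on
a 32-byte boundary inside the 64-byte-aligned allocation. The rounding expression behaves as
follows:

  #include <assert.h>

  #define ROUND_UP_32(w) (((w) + 31) & ~31)   /* same form as (dst_width + 31) & ~31 */

  int main(void) {
    assert(ROUND_UP_32(1)   == 32);
    assert(ROUND_UP_32(32)  == 32);
    assert(ROUND_UP_32(33)  == 64);
    assert(ROUND_UP_32(640) == 640);
    return 0;
  }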
@@ -1219,36 +1205,30 @@
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
       InterpolateRow_16_C;
   void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-       int dst_width, int x, int dx) =
-       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
              &x, &y, &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(dst_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_16_AVX2;
     if (IS_ALIGNED(dst_width, 32)) {
       InterpolateRow = InterpolateRow_16_AVX2;
@@ -1256,7 +1236,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_16_NEON;
     if (IS_ALIGNED(dst_width, 16)) {
       InterpolateRow = InterpolateRow_16_NEON;
@@ -1264,7 +1244,7 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
     InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1283,9 +1263,7 @@
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_16_C;
 #if defined(HAS_SCALECOLS_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleFilterCols = ScaleColsUp2_16_SSE2;
     }
 #endif
@@ -1299,7 +1277,7 @@
     const uint16* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
-    const int kRowSize = (dst_width + 15) & ~15;
+    const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 4);
 
     uint16* rowptr = (uint16*)row;
@@ -1366,17 +1344,14 @@
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleCols = ScaleColsUp2_C;
 #if defined(HAS_SCALECOLS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleCols = ScaleColsUp2_SSE2;
     }
 #endif
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1401,9 +1376,7 @@
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleCols = ScaleColsUp2_16_C;
 #if defined(HAS_SCALECOLS_16_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleCols = ScaleColsUp2_16_SSE2;
     }
 #endif
@@ -1428,8 +1401,7 @@
                 enum FilterMode filtering) {
   // Simplify filtering when possible.
   filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
-                                filtering);
+                                dst_width, dst_height, filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1445,9 +1417,9 @@
     CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
     return;
   }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
-    // Arbitrary scale vertically, but unscaled vertically.
+    // Arbitrary scale vertically, but unscaled horizontally.
     ScalePlaneVertical(src_height,
                        dst_width, dst_height,
                        src_stride, dst_stride, src, dst,
@@ -1478,7 +1450,7 @@
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+        (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
@@ -1512,8 +1484,7 @@
                   enum FilterMode filtering) {
   // Simplify filtering when possible.
   filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
-                                filtering);
+                                dst_width, dst_height, filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1606,6 +1577,7 @@
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
@@ -1637,6 +1609,7 @@
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
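Note: the public entry points now reject planes wider or taller than 32768, matching the
src_width < 32768 guards added around the SSE2/NEON column scalers above (presumably to keep the
16.16 x coordinates within what their 16-bit lane math can address). A hedged sketch of the added
validation; LIBYUV_MAX_DIMENSION is an illustrative name, not a library constant:

  #include <assert.h>
  #include <stdint.h>

  enum { LIBYUV_MAX_DIMENSION = 32768 };

  /* Mirrors the shape of the argument check; negative src_height (inverted image)
   * is still allowed, which is why only == 0 is rejected for the source size. */
  static int ValidateScaleArgs(const uint8_t* src, int src_width, int src_height,
                               const uint8_t* dst, int dst_width, int dst_height) {
    if (!src || !dst) return -1;
    if (src_width == 0 || src_height == 0) return -1;
    if (src_width > LIBYUV_MAX_DIMENSION || src_height > LIBYUV_MAX_DIMENSION) return -1;
    if (dst_width <= 0 || dst_height <= 0) return -1;
    return 0;
  }

  int main(void) {
    uint8_t a = 0, b = 0;
    assert(ValidateScaleArgs(&a, 1920, 1080, &b, 640, 360) == 0);
    assert(ValidateScaleArgs(&a, 40000, 1080, &b, 640, 360) == -1);   /* too wide */
    return 0;
  }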
diff --git a/libvpx/third_party/libyuv/source/scale_any.cc b/libvpx/third_party/libyuv/source/scale_any.cc
new file mode 100644
index 0000000..2f6a2c8
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/scale_any.cc
@@ -0,0 +1,200 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 int dst_width, int x, int dx) {                               \
+      int n = dst_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
+      }                                                                        \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
+             dst_width & MASK, x + n * dx, dx);                                \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C, 4, 3)
+#endif
+#undef CANY
+
+// Scale-down wrappers for fixed ratios (1/2, 1/4, 3/4, 3/8).
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
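Note: FACTOR is deliberately passed as an unparenthesized token sequence such as 4 / 3 or 8 / 3,
so (n * FACTOR) expands to (n * 4 / 3). Because n is always a multiple of MASK + 1, the
left-to-right multiply-then-divide is exact, and it yields the source offset at which the C tail
must resume. A small check (the macro and values below are illustrative, mirroring the SDANY
expansion):

  #include <assert.h>

  /* How the SDANY source-offset expression expands for the 3/4 scaler:
   * FACTOR is the token sequence 4 / 3, so (n * FACTOR) becomes (n * 4 / 3). */
  #define SRC_OFFSET(n, FACTOR, BPP) ((n * FACTOR) * BPP)

  int main(void) {
    int dst_width = 50;                     /* ScaleRowDown34: MASK = 23, BPP = 1 */
    int r = dst_width % 24;                 /* 2 pixels left for the C tail       */
    int n = dst_width - r;                  /* 48 pixels handled by the SIMD body */
    assert(SRC_OFFSET(n, 4 / 3, 1) == 64);  /* 48 * 4 / 3: exact, no truncation   */
    return 0;
  }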
+#ifdef HAS_SCALEROWDOWN2_SSE2
+SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+      2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+      2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSE2
+SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+      4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
+                     src_stepx, dst_ptr + n * BPP, r);                         \
+    }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+
+// Row-accumulation wrappers used by the box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
+      int n = src_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
+      }                                                                        \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
+    }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/libvpx/third_party/libyuv/source/scale_argb.cc b/libvpx/third_party/libyuv/source/scale_argb.cc
index e339cd7..40a2d1a 100644
--- a/libvpx/third_party/libyuv/source/scale_argb.cc
+++ b/libvpx/third_party/libyuv/source/scale_argb.cc
@@ -53,18 +53,27 @@
   }
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-        ScaleARGBRowDown2Box_SSE2);
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
   }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
-        ScaleARGBRowDown2_NEON;
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
   }
 #endif
 
@@ -88,7 +97,7 @@
                               int x, int dx, int y, int dy) {
   int j;
   // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@@ -98,17 +107,22 @@
   assert(dx == 65536 * 4);  // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
-  }
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
     ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@@ -139,16 +153,23 @@
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-        ScaleARGBRowDownEven_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
   }
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 4)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-        ScaleARGBRowDownEven_NEON;
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
   }
 #endif
 
@@ -190,29 +211,23 @@
   src_argb += xl * 4;
   x -= (int)(xl << 16);
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(clip_src_width, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(clip_src_width, 32)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -220,15 +235,15 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(clip_src_width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
     if (IS_ALIGNED(clip_src_width, 4)) {
@@ -241,6 +256,14 @@
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   // Allocate a row of ARGB.
   {
@@ -286,29 +309,23 @@
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(dst_width, 8)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -316,15 +333,15 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_MIPS_DSPR2;
   }
@@ -338,17 +355,31 @@
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
@@ -363,7 +394,7 @@
     const uint8* src = src_argb + yi * src_stride;
 
     // Allocate 2 rows of ARGB.
-    const int kRowSize = (dst_width * 4 + 15) & ~15;
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
     uint8* rowptr = row;
@@ -427,18 +458,15 @@
                         uint8* rgb_buf,
                         int width) = I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(src_width, 8)) {
-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        I422ToARGBRow = I422ToARGBRow_SSSE3;
-      }
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_I422TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     I422ToARGBRow = I422ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(src_width, 16)) {
       I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -446,7 +474,7 @@
   }
 #endif
 #if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(src_width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
@@ -467,29 +495,23 @@
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
       InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(dst_width, 8)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -497,15 +519,15 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
     InterpolateRow = InterpolateRow_MIPS_DSPR2;
   }
@@ -523,17 +545,31 @@
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
@@ -551,7 +587,7 @@
   const uint8* src_row_v = src_v + uv_yi * src_stride_v;
 
   // Allocate 2 rows of ARGB.
-  const int kRowSize = (dst_width * 4 + 15) & ~15;
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
 
   // Allocate 1 row of ARGB for source conversion.
@@ -637,12 +673,18 @@
     ScaleARGBCols = ScaleARGBCols_SSE2;
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
       ScaleARGBCols = ScaleARGBColsUp2_SSE2;
     }
 #endif
@@ -776,6 +818,7 @@
   if (!src_argb || src_width == 0 || src_height == 0 ||
       !dst_argb || dst_width <= 0 || dst_height <= 0 ||
       clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
@@ -794,6 +837,7 @@
               int dst_width, int dst_height,
               enum FilterMode filtering) {
   if (!src_argb || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
       !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
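Note: the ARGB path follows the same reshaping as the planar path above — _Any_ SSE2/NEON kernels
are installed whenever the CPU flag is present, exact kernels only at multiples of the SIMD step
(4 or 8 ARGB pixels here), the pointer-alignment requirements are gone, and the clip/scale entry
points gain the 32768 dimension guard. Compressed into a hedged check (the helper below is
illustrative, not library code):

  #include <assert.h>

  /* 0 = C fallback, 1 = _Any_ SIMD wrapper, 2 = exact-width SIMD kernel.
   * Pointer alignment no longer participates in the decision. */
  static int PickARGBRowDown2(int has_simd, int dst_width, int simd_step) {
    if (!has_simd) return 0;
    return (dst_width % simd_step == 0) ? 2 : 1;
  }

  int main(void) {
    assert(PickARGBRowDown2(0, 102, 4) == 0);   /* no SSE2/NEON: C fallback          */
    assert(PickARGBRowDown2(1, 102, 4) == 1);   /* SSE2, 102 % 4 != 0: Any wrapper   */
    assert(PickARGBRowDown2(1, 104, 4) == 2);   /* SSE2, 104 % 4 == 0: exact kernel  */
    return 0;
  }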
diff --git a/libvpx/third_party/libyuv/source/scale_common.cc b/libvpx/third_party/libyuv/source/scale_common.cc
index e4b2acc..1711f3d 100644
--- a/libvpx/third_party/libyuv/source/scale_common.cc
+++ b/libvpx/third_party/libyuv/source/scale_common.cc
@@ -621,39 +621,31 @@
   }
 }
 
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
-    dst_ptr[x] = sum < 65535u ? sum : 65535u;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
 
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height) {
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint16* s = src_ptr + x;
-    unsigned int sum = 0u;
-    int y;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    // No risk of overflow here now
-    dst_ptr[x] = sum;
+  for (x = 0; x < src_width - 1; x += 2) {
+    dst_ptr[0] += src_ptr[0];
+    dst_ptr[1] += src_ptr[1];
+    src_ptr += 2;
+    dst_ptr += 2;
+  }
+  if (src_width & 1) {
+    dst_ptr[0] += src_ptr[0];
   }
 }
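Note: ScaleAddRows_C, which summed an entire src_height-tall column per output pixel, is replaced
by ScaleAddRow_C, which adds a single source row into the running uint16 (or uint32) accumulator
two pixels at a time with an odd-pixel tail; ScalePlaneBox now supplies the vertical loop and
clears the accumulator before each output row. A small usage sketch of the accumulate-then-average
contract (the helper below is a plain-C stand-in):

  #include <assert.h>
  #include <stdint.h>

  /* Mirror of ScaleAddRow_C's contract: dst accumulates, it is never overwritten. */
  static void AddRow(const uint8_t* src, uint16_t* dst, int width) {
    for (int x = 0; x < width; ++x) dst[x] += src[x];
  }

  int main(void) {
    const uint8_t row0[4] = { 10, 21, 30, 40 };
    const uint8_t row1[4] = { 30, 20, 12,  0 };
    uint16_t acc[4] = { 0, 0, 0, 0 };            /* memset before each output row */
    AddRow(row0, acc, 4);
    AddRow(row1, acc, 4);                        /* boxheight == 2 */
    for (int x = 0; x < 4; ++x) {
      uint8_t out = (uint8_t)(acc[x] * (65536 / 2) >> 16);   /* ScaleAddCols-style average */
      assert(out == (row0[x] + row1[x]) / 2);
    }
    return 0;
  }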
 
@@ -885,31 +877,23 @@
   assert(dst_height > 0);
   src_argb += (x >> 16) * bpp;
 #if defined(HAS_INTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSE2;
-      }
+      InterpolateRow = InterpolateRow_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
     if (IS_ALIGNED(dst_width_bytes, 32)) {
       InterpolateRow = InterpolateRow_AVX2;
@@ -917,15 +901,15 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
@@ -967,31 +951,23 @@
   assert(dst_height > 0);
   src_argb += (x >> 16) * wpp;
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     InterpolateRow = InterpolateRow_Any_16_SSE2;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSE2;
-      }
+      InterpolateRow = InterpolateRow_16_SSE2;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_16_SSSE3;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
-      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        InterpolateRow = InterpolateRow_16_SSSE3;
-      }
+      InterpolateRow = InterpolateRow_16_SSSE3;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_16_AVX2;
     if (IS_ALIGNED(dst_width_bytes, 32)) {
       InterpolateRow = InterpolateRow_16_AVX2;
@@ -999,15 +975,15 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     InterpolateRow = InterpolateRow_Any_16_NEON;
     if (IS_ALIGNED(dst_width_bytes, 16)) {
       InterpolateRow = InterpolateRow_16_NEON;
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
@@ -1046,10 +1022,6 @@
     if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
       filtering = kFilterBilinear;
     }
-    // If scaling to larger, switch from Box to Bilinear.
-    if (dst_width >= src_width || dst_height >= src_height) {
-      filtering = kFilterBilinear;
-    }
   }
   if (filtering == kFilterBilinear) {
     if (src_height == 1) {
diff --git a/libvpx/third_party/libyuv/source/scale_posix.cc b/libvpx/third_party/libyuv/source/scale_gcc.cc
similarity index 78%
rename from libvpx/third_party/libyuv/source/scale_posix.cc
rename to libvpx/third_party/libyuv/source/scale_gcc.cc
index 352e667..8a6ac54 100644
--- a/libvpx/third_party/libyuv/source/scale_posix.cc
+++ b/libvpx/third_party/libyuv/source/scale_gcc.cc
@@ -101,24 +101,20 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "psrlw     $0x8,%%xmm0                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1"
   );
 }
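For reference, the psrlw/packuswb loop above keeps the high byte of each 16-bit pair, i.e. every second source pixel. A minimal scalar sketch of the same point-sampled 1/2 downscale (hypothetical helper name, not part of this patch):

  // Hypothetical scalar equivalent: keep every second (odd) source pixel.
  static void ScaleRowDown2_C_sketch(const uint8* src_ptr,
                                     uint8* dst_ptr, int dst_width) {
    int x;
    for (x = 0; x < dst_width; ++x) {
      dst_ptr[x] = src_ptr[x * 2 + 1];
    }
  }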
 
@@ -130,8 +126,8 @@
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "psrlw     $0x8,%%xmm0                     \n"
@@ -142,18 +138,14 @@
     "pavgw     %%xmm2,%%xmm0                   \n"
     "pavgw     %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
     "jg        1b                              \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
   );
 }
 
@@ -165,116 +157,9 @@
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
-  );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
-  );
-}
-
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-
-    LABELALIGN
-  "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    BUNDLEALIGN
     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
@@ -296,13 +181,8 @@
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
   : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
   );
 }
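The ScaleRowDown2Box_SSE2 loop above approximates a 2x2 box filter with chained rounding averages (pavgb/pavgw). A scalar sketch of the box average it targets (hypothetical helper, assuming the usual (sum + 2) >> 2 rounding):

  // Hypothetical scalar sketch: rounded average of each 2x2 source block.
  static void ScaleRowDown2Box_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
    const uint8* s = src_ptr;
    const uint8* t = src_ptr + src_stride;
    int x;
    for (x = 0; x < dst_width; ++x) {
      dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
      s += 2;
      t += 2;
    }
  }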
 
@@ -315,8 +195,8 @@
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pand      %%xmm5,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
@@ -330,11 +210,7 @@
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
   );
 }
 
@@ -348,18 +224,16 @@
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"
-    MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
-    MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
-    MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
+    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
+    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4
+    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm4,%%xmm2                   \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
@@ -388,13 +262,8 @@
     "+r"(dst_width),   // %2
     "+r"(stridex3)     // %3
   : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
   );
 }
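ScaleRowDown4Box_SSE2 above reduces 4x4 blocks, again via chained pavgb averages rather than an exact sum. A scalar sketch of the intended 4x4 box reduction (hypothetical helper, assuming (sum + 8) >> 4 rounding):

  // Hypothetical scalar sketch: rounded average of each 4x4 source block.
  static void ScaleRowDown4Box_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
    int x, i;
    for (x = 0; x < dst_width; ++x) {
      int sum = 0;
      for (i = 0; i < 4; ++i) {
        const uint8* s = src_ptr + i * src_stride + x * 4;
        sum += s[0] + s[1] + s[2] + s[3];
      }
      dst_ptr[x] = (uint8)((sum + 8) >> 4);
    }
  }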
 
@@ -412,8 +281,8 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm2,%%xmm1                   \n"
     "palignr   $0x8,%%xmm0,%%xmm1              \n"
@@ -429,11 +298,7 @@
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
     "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
@@ -461,8 +326,8 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
     "pavgb     %%xmm7,%%xmm6                   \n"
     "pshufb    %%xmm2,%%xmm6                   \n"
     "pmaddubsw %%xmm5,%%xmm6                   \n"
@@ -479,9 +344,8 @@
     "psrlw     $0x2,%%xmm6                     \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm7,%%xmm6                   \n"
     "pshufb    %%xmm4,%%xmm6                   \n"
@@ -498,13 +362,8 @@
     "+r"(dst_width)  // %2
   : "r"((intptr_t)(src_stride)),  // %3
     "m"(kMadd21)     // %4
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
@@ -533,8 +392,8 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
     "pavgb     %%xmm6,%%xmm7                   \n"
     "pavgb     %%xmm7,%%xmm6                   \n"
     "pshufb    %%xmm2,%%xmm6                   \n"
@@ -553,8 +412,8 @@
     "psrlw     $0x2,%%xmm6                     \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
     "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm6,%%xmm7                   \n"
     "pavgb     %%xmm7,%%xmm6                   \n"
@@ -572,13 +431,8 @@
       "+r"(dst_width)  // %2
     : "r"((intptr_t)(src_stride)),  // %3
       "m"(kMadd21)     // %4
-    : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+    : "memory", "cc", NACL_R14
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
@@ -590,8 +444,8 @@
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
@@ -607,10 +461,7 @@
     "+r"(dst_width)  // %2
   : "m"(kShuf38a),   // %3
     "m"(kShuf38b)    // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-      , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
   );
 }
 
@@ -631,9 +482,10 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "pshufb    %%xmm2,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm6                   \n"
@@ -643,23 +495,18 @@
     "paddusw   %%xmm0,%%xmm1                   \n"
     "pmulhuw   %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm1                   \n"
-    "sub       $0x6,%2                         \n"
     "movd      %%xmm1," MEMACCESS(1) "         \n"
     "psrlq     $0x10,%%xmm1                    \n"
     "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
     "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "sub       $0x6,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(dst_width)    // %2
   : "r"((intptr_t)(src_stride))  // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -679,8 +526,8 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
     "movhlps   %%xmm0,%%xmm1                   \n"
     "movhlps   %%xmm6,%%xmm7                   \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
@@ -689,7 +536,7 @@
     "punpcklbw %%xmm5,%%xmm7                   \n"
     "paddusw   %%xmm6,%%xmm0                   \n"
     "paddusw   %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
+    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movhlps   %%xmm6,%%xmm7                   \n"
     "punpcklbw %%xmm5,%%xmm6                   \n"
@@ -711,64 +558,53 @@
     "paddusw   %%xmm7,%%xmm6                   \n"
     "pmulhuw   %%xmm4,%%xmm6                   \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
-    "sub       $0x6,%2                         \n"
     "movd      %%xmm6," MEMACCESS(1) "         \n"
     "psrlq     $0x10,%%xmm6                    \n"
     "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
     "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "sub       $0x6,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
   : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 
+// Reads 16xN bytes and produces 16 shorts at a time.
 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width, int src_height) {
   int tmp_height = 0;
   intptr_t tmp_src = 0;
   asm volatile (
+    "mov       %0,%3                           \n"  // row pointer
+    "mov       %5,%2                           \n"  // height
+    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
+    "pxor      %%xmm1,%%xmm1                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
-    "sub       $0x1,%5                         \n"
 
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "mov       %0,%3                           \n"
-    "add       %6,%0                           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm4,%%xmm0                   \n"
-    "punpckhbw %%xmm4,%%xmm1                   \n"
-    "mov       %5,%2                           \n"
-    "test      %2,%2                           \n"
-    "je        3f                              \n"
-
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
-    "add       %6,%0                           \n"
+    "movdqu    " MEMACCESS(3) ",%%xmm2         \n"
+    "add       %6,%3                           \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"
     "punpckhbw %%xmm4,%%xmm3                   \n"
     "paddusw   %%xmm2,%%xmm0                   \n"
     "paddusw   %%xmm3,%%xmm1                   \n"
     "sub       $0x1,%2                         \n"
-    "jg        2b                              \n"
+    "jg        1b                              \n"
 
-    LABELALIGN
-  "3:                                          \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x10,3) ",%0           \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
     "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "mov       %0,%3                           \n"  // row pointer
+    "mov       %5,%2                           \n"  // height
+    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
+    "pxor      %%xmm1,%%xmm1                   \n"
     "sub       $0x10,%4                        \n"
     "jg        1b                              \n"
   : "+r"(src_ptr),     // %0
@@ -778,10 +614,7 @@
     "+r"(src_width),   // %4
     "+rm"(src_height)  // %5
   : "rm"((intptr_t)(src_stride))  // %6
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
   );
 }
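The rewritten ScaleAddRows_SSE2 above walks 16-byte column groups in the outer loop and sums src_height rows per group with paddusw. A scalar sketch of the per-column accumulation (hypothetical helper; the SIMD loop uses saturating adds):

  // Hypothetical scalar sketch: sum src_height bytes down each column.
  static void ScaleAddRows_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint16* dst_ptr, int src_width,
                                    int src_height) {
    int x, y;
    for (x = 0; x < src_width; ++x) {
      const uint8* s = src_ptr + x;
      int sum = 0;
      for (y = 0; y < src_height; ++y) {
        sum += s[0];
        s += src_stride;
      }
      dst_ptr[x] = (uint16)sum;  // paddusw in the SIMD loop saturates instead
    }
  }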
 
@@ -813,7 +646,6 @@
     MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
     "movd      %k2,%%xmm0                      \n"
     "psrlw     $0x9,%%xmm1                     \n"
-    BUNDLEALIGN
     MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
     "movd      %k2,%%xmm4                      \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
@@ -853,13 +685,8 @@
     "+rm"(dst_width)   // %5
   : "rm"(x),           // %6
     "rm"(dx)           // %7
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
@@ -870,25 +697,21 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
-    "sub       $0x20,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x20,%2                         \n"
     "jg        1b                              \n"
 
   : "+r"(dst_ptr),     // %0
     "+r"(src_ptr),     // %1
     "+r"(dst_width)    // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1"
   );
 }
 
@@ -898,22 +721,18 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "shufps    $0xdd,%%xmm1,%%xmm0             \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1"
   );
 }
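The shufps $0xdd above keeps the odd ARGB pixel of each pair. A scalar sketch of the same point-sampled ARGB 1/2 downscale (hypothetical helper; treats each pixel as one 32-bit word):

  // Hypothetical scalar equivalent: copy every second 32-bit ARGB pixel.
  static void ScaleARGBRowDown2_C_sketch(const uint8* src_argb,
                                         uint8* dst_argb, int dst_width) {
    const uint32* src = (const uint32*)src_argb;
    uint32* dst = (uint32*)dst_argb;
    int x;
    for (x = 0; x < dst_width; ++x) {
      dst[x] = src[x * 2 + 1];
    }
  }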
 
@@ -923,25 +742,21 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(dst_width)  // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  :: "memory", "cc", "xmm0", "xmm1"
   );
 }
 
@@ -951,11 +766,10 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    BUNDLEALIGN
-    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
-    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"
@@ -963,29 +777,23 @@
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(dst_width)   // %2
   : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+                               int src_stepx, uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12 = 0;
   asm volatile (
@@ -996,29 +804,22 @@
     "movd      " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
     "punpckldq %%xmm1,%%xmm0                   \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
     MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
     "punpckldq %%xmm3,%%xmm2                   \n"
     "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),      // %0
     "+r"(src_stepx_x4),  // %1
     "+r"(dst_argb),      // %2
     "+r"(dst_width),     // %3
     "+r"(src_stepx_x12)  // %4
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
 
@@ -1040,11 +841,9 @@
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
     MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
-    BUNDLEALIGN
     MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
     "movq      " MEMACCESS(5) ",%%xmm2         \n"
-    BUNDLEALIGN
     MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
     MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
     MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
@@ -1055,9 +854,9 @@
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
     "jg        1b                              \n"
   : "+r"(src_argb),       // %0
     "+r"(src_stepx_x4),   // %1
@@ -1065,14 +864,8 @@
     "+rm"(dst_width),     // %3
     "+r"(src_stepx_x12),  // %4
     "+r"(row1)            // %5
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
 
@@ -1111,15 +904,14 @@
     "pextrw    $0x3,%%xmm2,%k1                 \n"
     "punpckldq %%xmm4,%%xmm1                   \n"
     "punpcklqdq %%xmm1,%%xmm0                  \n"
-    "sub       $0x4,%4                         \n"
     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
     "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
     "jge       40b                             \n"
 
   "49:                                         \n"
     "test      $0x2,%4                         \n"
     "je        29f                             \n"
-    BUNDLEALIGN
     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
     "pextrw    $0x5,%%xmm2,%k0                 \n"
@@ -1139,13 +931,8 @@
     "+r"(dst_width)    // %4
   : "rm"(x),           // %5
     "rm"(dx)           // %6
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
   );
 }
 
@@ -1156,28 +943,22 @@
   asm volatile (
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpckldq %%xmm0,%%xmm0                   \n"
     "punpckhdq %%xmm1,%%xmm1                   \n"
-    "sub       $0x8,%2                         \n"
-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
     "jg        1b                              \n"
 
   : "+r"(dst_argb),    // %0
     "+r"(src_argb),    // %1
     "+r"(dst_width)    // %2
-  :
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
   );
 }
 
@@ -1225,7 +1006,6 @@
     "paddd     %%xmm3,%%xmm2                   \n"
     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
     "psrlw     $0x9,%%xmm1                     \n"
-    BUNDLEALIGN
     MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
     "pshufb    %%xmm5,%%xmm1                   \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
@@ -1245,7 +1025,6 @@
     "add       $0x1,%2                         \n"
     "jl        99f                             \n"
     "psrlw     $0x9,%%xmm2                     \n"
-    BUNDLEALIGN
     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
     "pshufb    %%xmm5,%%xmm2                   \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
@@ -1264,13 +1043,8 @@
     "+r"(x1)           // %4
   : "rm"(x),           // %5
     "rm"(dx)           // %6
-  : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
-    , "r14"
-#endif
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
diff --git a/libvpx/third_party/libyuv/source/scale_neon.cc b/libvpx/third_party/libyuv/source/scale_neon.cc
index 1b8a5ba..7825878 100644
--- a/libvpx/third_party/libyuv/source/scale_neon.cc
+++ b/libvpx/third_party/libyuv/source/scale_neon.cc
@@ -16,7 +16,8 @@
 #endif
 
 // This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
 
 // NEON downscalers with interpolation.
 // Provided by Fritz Koenig
@@ -42,6 +43,30 @@
   );
 }
 
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"     // Clobber List
+  );
+}
+
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
@@ -516,6 +541,112 @@
   );
 }
 
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       r12, %5                         \n"
+    "veor      q2, q2, q2                      \n"
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"
+    "vaddw.u8   q3, q3, d1                     \n"
+    "vaddw.u8   q2, q2, d0                     \n"
+    "subs       r12, r12, #1                   \n"
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                     \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"
+    "vmovl.u8   q8, d6                         \n"
+    "vmovl.u8   q9, d7                         \n"
+    "vsubl.s16  q11, d18, d16                  \n"
+    "vsubl.s16  q12, d19, d17                  \n"
+    "vmovl.u16  q13, d20                       \n"
+    "vmovl.u16  q10, d21                       \n"
+    "vmul.s32   q11, q11, q13                  \n"
+    "vmul.s32   q12, q12, q10                  \n"
+    "vshrn.s32  d18, q11, #16                  \n"
+    "vshrn.s32  d19, q12, #16                  \n"
+    "vadd.s16   q8, q8, q9                     \n"
+    "vmovn.s16  d6, q8                         \n"
+
+    MEMACCESS(0)
+    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
+    "vadd.s32   q1, q1, q0                     \n"
+    "vadd.s32   q2, q2, q0                     \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
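ScaleFilterCols_NEON above steps across the source in 16.16 fixed point: the integer part of x selects a pixel pair and the fraction blends them, eight outputs per iteration. A scalar sketch of the per-pixel math (hypothetical helper, not part of this patch):

  // Hypothetical scalar sketch of the 16.16 fixed-point horizontal filter.
  static void ScaleFilterCols_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                       int dst_width, int x, int dx) {
    int j;
    for (j = 0; j < dst_width; ++j) {
      int xi = x >> 16;              // integer source index
      int a = src_ptr[xi];
      int b = src_ptr[xi + 1];
      int frac = x & 0xffff;         // 16-bit fraction
      dst_ptr[j] = (uint8)(a + (((b - a) * frac) >> 16));
      x += dx;
    }
  }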
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                           const uint8* src_ptr, ptrdiff_t src_stride,
@@ -639,6 +770,35 @@
   );
 }
 
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    "vrshrn.u16 d2, q2, #1                     \n"
+    "vrshrn.u16 d3, q3, #1                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
+    "bgt       1b                              \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst, int dst_width) {
   asm volatile (
@@ -756,7 +916,120 @@
   );
 }
 
-#endif  // __ARM_NEON__
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  int tmp = 0;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(d0, 0)
+    LOAD1_DATA32_LANE(d0, 1)
+    LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1)
+    LOAD1_DATA32_LANE(d2, 0)
+    LOAD1_DATA32_LANE(d2, 1)
+    LOAD1_DATA32_LANE(d3, 0)
+    LOAD1_DATA32_LANE(d3, 1)
+
+    MEMACCESS(0)
+    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
+    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q8, q1, q0                     \n"
+  "1:                                          \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(d0, d2, 0)
+    LOAD2_DATA32_LANE(d0, d2, 1)
+    LOAD2_DATA32_LANE(d1, d3, 0)
+    LOAD2_DATA32_LANE(d1, d3, 1)
+    "vshrn.i32   d22, q8, #9                   \n"
+    "vand.16     d22, d22, d30                 \n"
+    "vdup.8      d24, d22[0]                   \n"
+    "vdup.8      d25, d22[2]                   \n"
+    "vdup.8      d26, d22[4]                   \n"
+    "vdup.8      d27, d22[6]                   \n"
+    "vext.8      d4, d24, d25, #4              \n"
+    "vext.8      d5, d26, d27, #4              \n"  // f
+    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
+    "vmull.u8    q11, d0, d20                  \n"
+    "vmull.u8    q12, d1, d21                  \n"
+    "vmull.u8    q13, d2, d4                   \n"
+    "vmull.u8    q14, d3, d5                   \n"
+    "vadd.i16    q11, q11, q13                 \n"
+    "vadd.i16    q12, q12, q14                 \n"
+    "vshrn.i16   d0, q11, #7                   \n"
+    "vshrn.i16   d1, q12, #7                   \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
+    "vadd.s32    q8, q8, q9                    \n"
+    "subs        %2, %2, #4                    \n"  // 4 processed per loop
+    "bgt         1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/scale_neon64.cc b/libvpx/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..1d55193
--- /dev/null
+++ b/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away the even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0, odd into v1
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1"              // Clobber List
+  );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #1              \n"
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1"     // Clobber List
+  );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
+    "uadalp     v1.8h, v3.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #2              \n"
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "v0", "v1", "v2", "v3"     // Clobber List
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "st1     {v2.8b}, [%1], #8                 \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
+    MEMACCESS(3)
+    "ld1     {v1.16b}, [%2], #16               \n"
+    MEMACCESS(4)
+    "ld1     {v2.16b}, [%3], #16               \n"
+    MEMACCESS(5)
+    "ld1     {v3.16b}, [%4], #16               \n"
+    "subs    %w5, %w5, #4                      \n"
+    "uaddlp  v0.8h, v0.16b                     \n"
+    "uadalp  v0.8h, v1.16b                     \n"
+    "uadalp  v0.8h, v2.16b                     \n"
+    "uadalp  v0.8h, v3.16b                     \n"
+    "addp    v0.8h, v0.8h, v0.8h               \n"
+    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
+    MEMACCESS(1)
+    "st1    {v0.s}[0], [%1], #4                \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(src_ptr1),  // %2
+    "+r"(src_ptr2),  // %3
+    "+r"(src_ptr3),  // %4
+    "+r"(dst_width)  // %5
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "subs      %w2, %w2, #24                           \n"
+    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"
+    "add       %3, %3, %0                              \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "ushll     v16.8h, v4.8b, #0                       \n"
+    "ushll     v17.8h, v5.8b, #0                       \n"
+    "ushll     v18.8h, v6.8b, #0                       \n"
+    "ushll     v19.8h, v7.8b, #0                       \n"
+
+    // 3 * line_0 + line_1
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "umlal     v17.8h, v1.8b, v20.8b                   \n"
+    "umlal     v18.8h, v2.8b, v20.8b                   \n"
+    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+    "uqrshrn   v1.8b, v17.8h, #2                       \n"
+    "uqrshrn   v2.8b, v18.8h, #2                       \n"
+    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v16.8h, v1.8b, #0                       \n"
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v16.8h, v2.8b, #0                       \n"
+    "umlal     v16.8h, v3.8b, v20.8b                   \n"
+    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+    "v20", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"
+    "add       %3, %3, %0                              \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"
+    // average src line 0 with src line 1
+    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v4.8h, v1.8b, #0                        \n"
+    "umlal     v4.8h, v0.8b, v20.8b                    \n"
+    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v4.8h, v2.8b, #0                        \n"
+    "umlal     v4.8h, v3.8b, v20.8b                    \n"
+    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+  );
+}
+
+static uvec8 kShuf38 =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
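+// Note: these constants feed sqrdmulh, a doubling multiply returning the
+// high half, so 65536 / 12 acts as a divide by 6 and 65536 / 18 as a
+// divide by 9.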
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1       {v3.16b}, [%3]                          \n"
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
+    "subs      %w2, %w2, #12                           \n"
+    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
+    MEMACCESS(1)
+    "st1       {v2.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v2.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+  ptrdiff_t tmp_src_stride = src_stride;
+
+  asm volatile (
+    MEMACCESS(5)
+    "ld1       {v29.8h}, [%5]                          \n"
+    MEMACCESS(6)
+    "ld1       {v30.16b}, [%6]                         \n"
+    MEMACCESS(7)
+    "ld1       {v31.8h}, [%7]                          \n"
+    "add       %2, %2, %0                              \n"
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    MEMACCESS(4)
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
+    "subs      %w4, %w4, #12                           \n"
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v20.8b, v0.8b, v1.8b                    \n"
+    "trn2      v21.8b, v0.8b, v1.8b                    \n"
+    "trn1      v22.8b, v4.8b, v5.8b                    \n"
+    "trn2      v23.8b, v4.8b, v5.8b                    \n"
+    "trn1      v24.8b, v16.8b, v17.8b                  \n"
+    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+    "trn1      v16.8b, v18.8b, v19.8b                  \n"
+    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v20.4h, v20.8b                          \n"
+    "uaddlp    v21.4h, v21.8b                          \n"
+    "uaddlp    v22.4h, v22.8b                          \n"
+    "uaddlp    v23.4h, v23.8b                          \n"
+    "uaddlp    v24.4h, v24.8b                          \n"
+    "uaddlp    v25.4h, v25.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+
+    // combine source lines
+    "add       v20.4h, v20.4h, v22.4h                  \n"
+    "add       v21.4h, v21.4h, v23.4h                  \n"
+    "add       v20.4h, v20.4h, v24.4h                  \n"
+    "add       v21.4h, v21.4h, v25.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+    "add       v2.4h, v2.4h, v17.4h                    \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+    "xtn       v2.8b,  v2.8h                           \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "ushll     v16.8h, v16.8b, #0                      \n"
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // combine source lines
+    "add       v0.8h, v0.8h, v16.8h                    \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v20.8h, v20.8h, v0.8h                   \n"
+    "add       v21.8h, v21.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
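+    //  Here v31 holds kMult38_Div9, dividing the 9-pixel sums by 9.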
+    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+
+    // Align for table lookup; tbl requires the source registers to
+    //  be adjacent
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(tmp_src_stride),   // %2
+    "+r"(src_ptr1),         // %3
+    "+r"(dst_width)         // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+    "v30", "v31", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;
+  asm volatile (
+    MEMACCESS(4)
+    "ld1       {v30.8h}, [%4]                          \n"
+    MEMACCESS(5)
+    "ld1       {v31.16b}, [%5]                         \n"
+    "add       %2, %2, %0                              \n"
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    "subs      %w3, %w3, #12                           \n"
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v16.8b, v0.8b, v1.8b                    \n"
+    "trn2      v17.8b, v0.8b, v1.8b                    \n"
+    "trn1      v18.8b, v4.8b, v5.8b                    \n"
+    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v16.4h, v16.8b                          \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+    "uaddlp    v18.4h, v18.8b                          \n"
+    "uaddlp    v19.4h, v19.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+
+    // combine source lines
+    "add       v16.4h, v16.4h, v18.4h                  \n"
+    "add       v17.4h, v17.4h, v19.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+    // combine source lines
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v16.8h, v16.8h, v0.8h                   \n"
+    "add       v17.8h, v17.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
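+    //  Here v30 holds kMult38_Div6, dividing the 6-pixel sums by 6.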
+    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+
+    // Align for table lookup; tbl requires the source registers to
+    //  be adjacent
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),         // %0
+    "+r"(dst_ptr),         // %1
+    "+r"(tmp_src_stride),  // %2
+    "+r"(dst_width)        // %3
+  : "r"(&kMult38_Div6),    // %4
+    "r"(&kShuf38_2)        // %5
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v30", "v31", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       w12, %w5                        \n"
+    "eor       v2.16b, v2.16b, v2.16b          \n"
+    "eor       v3.16b, v3.16b, v3.16b          \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
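+    // and accumulate them into the 16-bit column sums in v2/v3
+    // (uaddw widens the low 8 bytes, uaddw2 the high 8)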
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %3              \n"
+    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
+    "uaddw     v2.8h, v2.8h, v0.8b             \n"
+    "subs      w12, w12, #1                    \n"
+    "b.gt      2b                              \n"
+    MEMACCESS(2)
+    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+    "add      %1, %1, #16                      \n"
+    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+    "b.gt     1b                               \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                    \n"              \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+
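+// LOAD2_DATA8_LANE takes the integer part of the 16.16 fixed-point x in %3,
+// loads the two source bytes at that offset into lane n of v4/v5, then
+// advances x by dx (%4).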
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  int64 dst_width64 = (int64) dst_width;  // Work around iOS 64-bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v1.4s, v1.4s, v0.4s            \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "add        v2.4s, v1.4s, v3.4s            \n"
+    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
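+    // Bilinear blend: dst = a + (((b - a) * frac) >> 16), where a/b are the
+    // pixel pairs in v4/v5 and frac is the low 16 bits of each x.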
+    "mov       v6.16b, v1.16b                  \n"
+    "mov       v7.16b, v2.16b                  \n"
+    "uzp1      v6.8h, v6.8h, v7.8h             \n"
+    "ushll     v4.8h, v4.8b, #0                \n"
+    "ushll     v5.8h, v5.8b, #0                \n"
+    "ssubl     v16.4s, v5.4h, v4.4h            \n"
+    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
+    "ushll     v7.4s, v6.4h, #0                \n"
+    "ushll2    v6.4s, v6.8h, #0                \n"
+    "mul       v16.4s, v16.4s, v7.4s           \n"
+    "mul       v17.4s, v17.4s, v6.4s           \n"
+    "shrn      v6.4h, v16.4s, #16              \n"
+    "shrn2     v6.8h, v17.4s, #16              \n"
+    "add       v4.8h, v4.8h, v6.8h             \n"
+    "xtn       v4.8b, v4.8h                    \n"
+
+    MEMACCESS(0)
+    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
+    "add       v1.4s, v1.4s, v0.4s             \n"
+    "add       v2.4s, v2.4s, v0.4s             \n"
+    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
+    "b.gt      1b                              \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
+    "v4", "v5", "v6", "v7", "v16", "v17"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;
+  asm volatile (
+    "cmp          %w4, #0                      \n"
+    "b.eq         100f                         \n"
+    "add          %2, %2, %1                   \n"
+    "cmp          %w4, #64                     \n"
+    "b.eq         75f                          \n"
+    "cmp          %w4, #128                    \n"
+    "b.eq         50f                          \n"
+    "cmp          %w4, #192                    \n"
+    "b.eq         25f                          \n"
+
+    "dup          v5.8b, %w4                   \n"
+    "dup          v4.8b, %w5                   \n"
+    // General purpose row blend.
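+    // dst = (row0 * (256 - source_y_fraction) +
+    //        row1 * source_y_fraction + 128) >> 8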
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "umull        v6.8h, v0.8b, v4.8b          \n"
+    "umull2       v7.8h, v0.16b, v4.16b        \n"
+    "umlal        v6.8h, v1.8b, v5.8b          \n"
+    "umlal2       v7.8h, v1.16b, v5.16b        \n"
+    "rshrn        v0.8b, v6.8h, #8             \n"
+    "rshrn2       v0.16b, v7.8h, #8            \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v1.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v0.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "st1          {v0.b}[15], [%0]             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction),// %4
+    "+r"(y_fraction)        // %5
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
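+    // (ld2 with 32-bit lanes de-interleaves whole ARGB pixels, so storing
+    //  v1/v3 keeps every other pixel)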
+    MEMACCESS (0)
+    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
+    MEMACCESS (0)
+    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS (1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    MEMACCESS (1)
+    "st1        {v3.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (dst),              // %1
+    "+r" (dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS (0)
+    // load 8 ARGB pixels.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #1               \n"
+    "rshrn      v2.8b, v2.8h, #1               \n"
+    "rshrn      v3.8b, v3.8h, #1               \n"
+    MEMACCESS (1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS (0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS (1)
+    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
+    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
+    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
+    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
+    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #2               \n"
+    "rshrn      v2.8b, v2.8h, #2               \n"
+    "rshrn      v3.8b, v3.8h, #2               \n"
+    MEMACCESS (2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (src_stride),       // %1
+    "+r" (dst),              // %2
+    "+r" (dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[0], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[1], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[2], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[3], [%0], %3            \n"
+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"((int64)(src_stepx * 4)) // %3
+  : "memory", "cc", "v0"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in the future.
+// It could be upgraded to process 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v4.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v5.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v6.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v7.8b}, [%1], %4              \n"
+    "uaddl      v0.8h, v0.8b, v1.8b            \n"
+    "uaddl      v2.8h, v2.8b, v3.8b            \n"
+    "uaddl      v4.8h, v4.8b, v5.8b            \n"
+    "uaddl      v6.8h, v6.8b, v7.8b            \n"
+    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+    "mov        v0.d[1], v2.d[0]               \n"
+    "mov        v2.d[0], v16.d[1]              \n"
+    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+    "mov        v4.d[1], v6.d[0]               \n"
+    "mov        v6.d[0], v16.d[1]              \n"
+    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "st1     {v0.16b}, [%2], #16               \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"((int64)(src_stepx * 4)) // %4
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld1        {"#vn".s}["#n"], [%6]          \n"
+
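+// LOAD1_DATA32_LANE loads the ARGB pixel at the integer part of the 16.16
+// fixed-point x in %3 (scaled by 4 bytes per pixel) into lane n, then
+// advances x by dx (%4).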
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width;  // Work around iOS 64-bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  int64 tmp64 = 0;
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(v0, 0)
+    LOAD1_DATA32_LANE(v0, 1)
+    LOAD1_DATA32_LANE(v0, 2)
+    LOAD1_DATA32_LANE(v0, 3)
+    LOAD1_DATA32_LANE(v1, 0)
+    LOAD1_DATA32_LANE(v1, 1)
+    LOAD1_DATA32_LANE(v1, 2)
+    LOAD1_DATA32_LANE(v1, 3)
+
+    MEMACCESS(0)
+    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "b.gt        1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp64),            // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width;  // Work around iOS 64-bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"
+    "movi       v3.16b, #0x7f                  \n"  // 0x7F
+    "movi       v4.8h, #0x7f                   \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v5.4s, v1.4s, v0.4s            \n"
+  "1:                                          \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(v0, v1, 0)
+    LOAD2_DATA32_LANE(v0, v1, 1)
+    LOAD2_DATA32_LANE(v0, v1, 2)
+    LOAD2_DATA32_LANE(v0, v1, 3)
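+    // Extract a 7-bit blend fraction from each 16.16 x:
+    // f = (x >> 9) & 0x7f, then dst = (a * (f ^ 0x7f) + b * f) >> 7.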
+    "shrn       v2.4h, v5.4s, #9               \n"
+    "and        v2.8b, v2.8b, v4.8b            \n"
+    "dup        v16.8b, v2.b[0]                \n"
+    "dup        v17.8b, v2.b[2]                \n"
+    "dup        v18.8b, v2.b[4]                \n"
+    "dup        v19.8b, v2.b[6]                \n"
+    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
+    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
+    "ins        v2.d[1], v17.d[0]              \n"  // f
+    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
+    "umull      v16.8h, v0.8b, v7.8b           \n"
+    "umull2     v17.8h, v0.16b, v7.16b         \n"
+    "umull      v18.8h, v1.8b, v2.8b           \n"
+    "umull2     v19.8h, v1.16b, v2.16b         \n"
+    "add        v16.8h, v16.8h, v18.8h         \n"
+    "add        v17.8h, v17.8h, v19.8h         \n"
+    "shrn       v0.8b, v16.8h, #7              \n"
+    "shrn2      v0.16b, v17.8h, #7             \n"
+
+    MEMACCESS(0)
+    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
+    "add     v5.4s, v5.4s, v6.4s               \n"
+    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
+    "b.gt    1b                                \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+    "v6", "v7", "v16", "v17", "v18", "v19"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libvpx/third_party/libyuv/source/scale_win.cc b/libvpx/third_party/libyuv/source/scale_win.cc
index 840b973..c3896eb 100644
--- a/libvpx/third_party/libyuv/source/scale_win.cc
+++ b/libvpx/third_party/libyuv/source/scale_win.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -16,7 +17,8 @@
 #endif
 
 // This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
 
 // Offsets for source bytes 0 to 9
 static uvec8 kShuf0 =
@@ -93,8 +95,7 @@
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
@@ -103,17 +104,16 @@
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     psrlw      xmm0, 8               // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         wloop
 
     ret
@@ -121,8 +121,7 @@
 }
 
 // Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
@@ -133,10 +132,9 @@
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
 
     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
@@ -149,9 +147,9 @@
     pavgw      xmm1, xmm3
     packuswb   xmm0, xmm1
 
-    sub        ecx, 16
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         wloop
 
     ret
@@ -159,8 +157,7 @@
 }
 
 // Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
@@ -172,120 +169,6 @@
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
-    align      4
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    sub        ecx, 16
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    pop        esi
-    ret
-  }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-
-    align      4
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
-  wloop:
-    movdqu     xmm0, [eax]
-    movdqu     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    sub        ecx, 16
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    jg         wloop
-
-    ret
-  }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-    align      4
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
@@ -305,9 +188,9 @@
     pavgw      xmm1, xmm3
     packuswb   xmm0, xmm1
 
-    sub        ecx, 16
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 16
     jg         wloop
 
     pop        esi
@@ -315,9 +198,116 @@
   }
 }
 
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm1, ymm1, 8
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
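+    // (vpackuswb packs within 128-bit lanes; the vpermq restores pixel order)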
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15
+    vpackuswb   ymm4, ymm4, ymm4
+    vpxor       ymm5, ymm5, ymm5      // constant 0
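+    // vpmaddubsw with a vector of 1s sums adjacent byte pairs into words;
+    // vpavgw against zero then gives the rounded average (a + b + 1) / 2.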
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+
+    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi
+    mov         eax, [esp + 4 + 4]    // src_ptr
+    mov         esi, [esp + 4 + 8]    // src_stride
+    mov         edx, [esp + 4 + 12]   // dst_ptr
+    mov         ecx, [esp + 4 + 16]   // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15
+    vpackuswb   ymm4, ymm4, ymm4
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vpavgb      ymm0, ymm0, [eax + esi]
+    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    lea         eax,  [eax + 64]
+
+    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32
+    jg          wloop
+
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
 // Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
@@ -329,19 +319,18 @@
     psrld      xmm5, 24
     pslld      xmm5, 16
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     pand       xmm0, xmm5
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     psrlw      xmm0, 8
     packuswb   xmm0, xmm0
-    sub        ecx, 8
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
+    sub        ecx, 8
     jg         wloop
 
     ret
@@ -349,8 +338,7 @@
 }
 
 // Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
@@ -364,18 +352,17 @@
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    pavgb      xmm0, xmm2            // average rows
+    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    movdqa     xmm2, [eax + esi * 2]
-    movdqa     xmm3, [eax + esi * 2 + 16]
-    movdqa     xmm4, [eax + edi]
-    movdqa     xmm5, [eax + edi + 16]
+    movdqu     xmm2, [eax + esi * 2]
+    movdqu     xmm3, [eax + esi * 2 + 16]
+    movdqu     xmm4, [eax + edi]
+    movdqu     xmm5, [eax + edi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm2, xmm4
     pavgb      xmm3, xmm5
@@ -398,9 +385,9 @@
     pavgw      xmm0, xmm2
     packuswb   xmm0, xmm0
 
-    sub        ecx, 8
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
+    sub        ecx, 8
     jg         wloop
 
     pop        edi
@@ -409,13 +396,102 @@
   }
 }
 
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    vpsrld      ymm5, ymm5, 24
+    vpslld      ymm5, ymm5, 16
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpand       ymm0, ymm0, ymm5
+    vpand       ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpsrlw      ymm0, ymm0, 8
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0
+    lea         edx, [edx + 16]
+    sub         ecx, 16
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi
+    push        edi
+    mov         eax, [esp + 8 + 4]    // src_ptr
+    mov         esi, [esp + 8 + 8]    // src_stride
+    mov         edx, [esp + 8 + 12]   // dst_ptr
+    mov         ecx, [esp + 8 + 16]   // dst_width
+    lea         edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
+    vpsrlw      ymm7, ymm7, 8
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vpavgb      ymm0, ymm0, [eax + esi]
+    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    vmovdqu     ymm2, [eax + esi * 2]
+    vmovdqu     ymm3, [eax + esi * 2 + 32]
+    vpavgb      ymm2, ymm2, [eax + edi]
+    vpavgb      ymm3, ymm3, [eax + edi + 32]
+    lea         eax, [eax + 64]
+    vpavgb      ymm0, ymm0, ymm2
+    vpavgb      ymm1, ymm1, ymm3
+
+    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
+    vpand       ymm3, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 8
+    vpsrlw      ymm1, ymm1, 8
+    vpavgw      ymm0, ymm0, ymm2
+    vpavgw      ymm1, ymm1, ymm3
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
+    vpsrlw      ymm0, ymm0, 8
+    vpavgw      ymm0, ymm0, ymm2
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], xmm0
+    lea         edx, [edx + 16]
+    sub         ecx, 16
+    jg          wloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
 // Point samples 32 pixels to 24 pixels.
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
@@ -427,10 +503,9 @@
     movdqa     xmm4, kShuf1
     movdqa     xmm5, kShuf2
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm1
     palignr    xmm1, xmm0, 8
@@ -463,8 +538,7 @@
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
@@ -481,10 +555,9 @@
     movdqa     xmm6, kMadd11
     movdqa     xmm7, kRound34
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
     pmaddubsw  xmm0, xmm5
@@ -501,8 +574,8 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm4
@@ -511,9 +584,9 @@
     paddsw     xmm0, xmm7
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
-    sub        ecx, 24
     movq       qword ptr [edx + 16], xmm0
     lea        edx, [edx + 24]
+    sub        ecx, 24
     jg         wloop
 
     pop        esi
@@ -522,8 +595,7 @@
 }
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
@@ -540,10 +612,9 @@
     movdqa     xmm6, kMadd11
     movdqa     xmm7, kRound34
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    movdqa     xmm1, [eax + esi]
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
@@ -562,8 +633,8 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqa     xmm0, [eax + 16]      // pixels 16..23
-    movdqa     xmm1, [eax + esi + 16]
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -573,9 +644,9 @@
     paddsw     xmm0, xmm7
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
-    sub        ecx, 24
     movq       qword ptr [edx + 16], xmm0
     lea        edx, [edx+24]
+    sub        ecx, 24
     jg         wloop
 
     pop        esi
@@ -586,7 +657,7 @@
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
@@ -597,20 +668,19 @@
     movdqa     xmm4, kShuf38a
     movdqa     xmm5, kShuf38b
 
-    align      4
   xloop:
-    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    sub        ecx, 12
     movq       qword ptr [edx], xmm0  // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
+    sub        ecx, 12
     jg         xloop
 
     ret
@@ -618,7 +688,7 @@
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
@@ -633,10 +703,9 @@
     movdqa     xmm4, kScaleAc33
     pxor       xmm5, xmm5
 
-    align      4
   xloop:
-    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
-    movdqa     xmm6, [eax + esi]
+    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
     punpcklbw  xmm0, xmm5
@@ -645,7 +714,7 @@
     punpcklbw  xmm7, xmm5
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
-    movdqa     xmm6, [eax + esi * 2]
+    movdqu     xmm6, [eax + esi * 2]
     lea        eax, [eax + 16]
     movhlps    xmm7, xmm6
     punpcklbw  xmm6, xmm5
@@ -671,11 +740,11 @@
     pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
     packuswb   xmm6, xmm6
 
-    sub        ecx, 6
     movd       [edx], xmm6           // write 6 pixels
     psrlq      xmm6, 16
     movd       [edx + 2], xmm6
     lea        edx, [edx + 6]
+    sub        ecx, 6
     jg         xloop
 
     pop        esi
@@ -684,7 +753,7 @@
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
@@ -699,11 +768,11 @@
     movdqa     xmm4, kShufAb2
     movdqa     xmm5, kScaleAb2
 
-    align      4
   xloop:
-    movdqa     xmm0, [eax]           // average 2 rows into xmm0
-    pavgb      xmm0, [eax + esi]
+    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm1, [eax + esi]
     lea        eax, [eax + 16]
+    pavgb      xmm0, xmm1
 
     movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
@@ -716,11 +785,11 @@
     pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
     packuswb   xmm1, xmm1
 
-    sub        ecx, 6
     movd       [edx], xmm1           // write 6 pixels
     psrlq      xmm1, 16
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
+    sub        ecx, 6
     jg         xloop
 
     pop        esi
@@ -728,79 +797,68 @@
   }
 }
 
-// Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height) {
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    pxor       xmm4, xmm4
-    dec        ebx
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5
 
-    align      4
+  // sum rows
   xloop:
-    // first row
-    movdqa     xmm0, [esi]
-    lea        eax, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    lea        esi, [esi + 16]
-    mov        ebp, ebx
-    test       ebp, ebp
-    je         ydone
-
-    // sum remaining rows
-    align      4
-  yloop:
-    movdqa     xmm2, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
+    movdqu     xmm3, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5
+    punpckhbw  xmm3, xmm5
     paddusw    xmm0, xmm2        // sum 16 words
     paddusw    xmm1, xmm3
-    sub        ebp, 1
-    jg         yloop
-
-    align      4
-  ydone:
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm1
-    lea        edi, [edi + 32]
-
+    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
     sub        ecx, 16
     jg         xloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
     ret
   }
 }
 
-// Bilinear column filtering. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-// TODO(fbarchard): Switch the following:
-//    xor        ebx, ebx
-//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
-// To
-//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
-// when drmemory bug fixed.
-// https://code.google.com/p/drmemory/issues/detail?id=1396
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov         eax, [esp + 4]   // src_ptr
+    mov         edx, [esp + 8]   // dst_ptr
+    mov         ecx, [esp + 12]  // src_width
+    vpxor       ymm5, ymm5, ymm5
 
-__declspec(naked) __declspec(align(16))
+  // sum rows
+  xloop:
+    vmovdqu     ymm3, [eax]       // read 32 bytes
+    lea         eax, [eax + 32]
+    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    vpunpcklbw  ymm2, ymm3, ymm5
+    vpunpckhbw  ymm3, ymm3, ymm5
+    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm1, ymm3, [edx + 32]
+    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]
+    sub         ecx, 32
+    jg          xloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEADDROW_AVX2
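
/* Minimal scalar sketch of what ScaleAddRow_SSE2/_AVX2 compute (an assumed
 * reference, not the library's own C path): widen each source byte and add
 * it into the 16-bit destination row, which the caller is presumed to have
 * initialized. The SIMD versions use saturating adds (paddusw/vpaddusw);
 * this sketch wraps instead. */
#include <stdint.h>
static void ScaleAddRow_Ref(const uint8_t* src_ptr, uint16_t* dst_ptr,
                            int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}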
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx) {
   __asm {
@@ -828,7 +886,6 @@
     pextrw     edx, xmm2, 3         // get x1 integer. preroll
 
     // 2 Pixel loop.
-    align      4
   xloop2:
     movdqa     xmm1, xmm2           // x0, x1 fractions.
     paddd      xmm2, xmm3           // x += dx
@@ -851,7 +908,6 @@
     sub        ecx, 2               // 2 pixels
     jge        xloop2
 
-    align      4
  xloop29:
 
     add        ecx, 2 - 1
@@ -869,7 +925,6 @@
     movd       ebx, xmm0
     mov        [edi], bl
 
-    align      4
  xloop99:
 
     pop        edi
@@ -880,8 +935,7 @@
 }
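
/* Hedged sketch of the 16.16 fixed-point column walk ScaleFilterCols_SSSE3
 * performs two pixels at a time: the integer part of x selects a source
 * pair and the fraction blends them. The SIMD blend may round slightly
 * differently, and the odd-pixel tail handled by the xloop29/xloop99 labels
 * above is ignored here. */
#include <stdint.h>
static void ScaleFilterCols_Ref(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;           /* integer source index */
    int f = x & 0xffff;         /* 16-bit fraction */
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (65536 - f) + b * f) >> 16);
    x += dx;
  }
}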
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx) {
   __asm {
@@ -889,17 +943,16 @@
     mov        eax, [esp + 8]    // src_ptr
     mov        ecx, [esp + 12]   // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0
     punpckhbw  xmm1, xmm1
-    sub        ecx, 32
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
+    sub        ecx, 32
     jg         wloop
 
     ret
@@ -907,8 +960,7 @@
 }
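
/* Scalar sketch of the 2x horizontal upsample above (assumed reference):
 * every source byte is written twice. The x and dx parameters are not read
 * in this specialized path; dst_width is assumed even. */
#include <stdint.h>
static void ScaleColsUp2_Ref(uint8_t* dst_ptr, const uint8_t* src_ptr,
                             int dst_width, int x, int dx) {
  int j;
  (void)x;
  (void)dx;
  for (j = 0; j < dst_width; j += 2) {
    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j / 2];
  }
}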
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                             ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) {
@@ -918,15 +970,14 @@
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     shufps     xmm0, xmm1, 0xdd
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         wloop
 
     ret
@@ -934,8 +985,7 @@
 }
 
 // Blends 8x1 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   uint8* dst_argb, int dst_width) {
@@ -945,18 +995,17 @@
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         wloop
 
     ret
@@ -964,8 +1013,7 @@
 }
 
 // Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                ptrdiff_t src_stride,
                                uint8* dst_argb, int dst_width) {
@@ -976,12 +1024,11 @@
     mov        edx, [esp + 4 + 12]   // dst_argb
     mov        ecx, [esp + 4 + 16]   // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
@@ -989,9 +1036,9 @@
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         wloop
 
     pop        esi
@@ -1000,8 +1047,7 @@
 }
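
/* Sketch of the 2x2 ARGB box filter above in scalar form (an assumed
 * reference): each output pixel is the per-channel average of a 2x2 block
 * of source pixels. pavgb rounds halves up; the +2 below approximates
 * that. The Down2Linear variant earlier does the same with a single row. */
#include <stddef.h>
#include <stdint.h>
static void ScaleARGBRowDown2Box_Ref(const uint8_t* src_argb,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {   /* four interleaved channels */
      dst_argb[c] = (uint8_t)((src_argb[c] + src_argb[c + 4] +
                               src_argb[src_stride + c] +
                               src_argb[src_stride + c + 4] + 2) >> 2);
    }
    src_argb += 8;   /* two source pixels */
    dst_argb += 4;   /* one destination pixel */
  }
}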
 
 // Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
@@ -1016,7 +1062,6 @@
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
-    align      4
   wloop:
     movd       xmm0, [eax]
     movd       xmm1, [eax + ebx]
@@ -1026,9 +1071,9 @@
     lea        eax,  [eax + ebx * 4]
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         wloop
 
     pop        edi
@@ -1038,8 +1083,7 @@
 }
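
/* Assumed scalar equivalent of the unfiltered even-pixel ARGB decimation
 * above: copy every src_stepx-th source pixel. src_stride is unused in
 * this path; the Box variant that follows averages 2x2 blocks instead. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ScaleARGBRowDownEven_Ref(const uint8_t* src_argb,
                                     ptrdiff_t src_stride, int src_stepx,
                                     uint8_t* dst_argb, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; ++x) {
    memcpy(dst_argb + 4 * x, src_argb + 4 * x * src_stepx, 4);
  }
}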
 
 // Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
@@ -1057,7 +1101,6 @@
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
-    align      4
   wloop:
     movq       xmm0, qword ptr [eax]  // row0 4 pairs
     movhps     xmm0, qword ptr [eax + ebx]
@@ -1075,9 +1118,9 @@
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
-    sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
+    sub        ecx, 4
     jg         wloop
 
     pop        edi
@@ -1088,7 +1131,7 @@
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
   __asm {
@@ -1118,7 +1161,6 @@
     jl         xloop49
 
     // 4 Pixel loop.
-    align      4
  xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
@@ -1133,12 +1175,11 @@
     pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
     punpckldq  xmm1, xmm4             // x2 x3
     punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
-    sub        ecx, 4                 // 4 pixels
     movdqu     [edi], xmm0
     lea        edi, [edi + 16]
+    sub        ecx, 4                 // 4 pixels
     jge        xloop4
 
-    align      4
  xloop49:
     test       ecx, 2
     je         xloop29
@@ -1159,7 +1200,6 @@
     // 1 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
-    align      4
  xloop99:
 
     pop        esi
@@ -1182,7 +1222,7 @@
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
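
/* The unfiltered ARGB column scaler earlier (ScaleARGBCols_SSE2) walks x in
 * 16.16 fixed point and copies the nearest source pixel; a hedged scalar
 * sketch is below. The SSSE3 routine that follows adds bilinear filtering
 * on top of the same x/dx walk. Assumes 4-byte-aligned ARGB buffers. */
#include <stdint.h>
static void ScaleARGBCols_Ref(uint8_t* dst_argb, const uint8_t* src_argb,
                              int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  /* 4 bytes per pixel */
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}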
 
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   __asm {
@@ -1209,7 +1249,6 @@
     pextrw     edx, xmm2, 3         // get x1 integer. preroll
 
     // 2 Pixel loop.
-    align      4
   xloop2:
     movdqa     xmm1, xmm2           // x0, x1 fractions.
     paddd      xmm2, xmm3           // x += dx
@@ -1229,7 +1268,6 @@
     sub        ecx, 2               // 2 pixels
     jge        xloop2
 
-    align      4
  xloop29:
 
     add        ecx, 2 - 1
@@ -1246,7 +1284,6 @@
     packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
 
-    align      4
  xloop99:
 
     pop        edi
@@ -1256,8 +1293,7 @@
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx) {
   __asm {
@@ -1265,17 +1301,16 @@
     mov        eax, [esp + 8]    // src_argb
     mov        ecx, [esp + 12]   // dst_width
 
-    align      4
   wloop:
-    movdqa     xmm0, [eax]
+    movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpckldq  xmm0, xmm0
     punpckhdq  xmm1, xmm1
-    sub        ecx, 8
-    movdqa     [edx], xmm0
-    movdqa     [edx + 16], xmm1
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
+    sub        ecx, 8
     jg         wloop
 
     ret
@@ -1283,7 +1318,7 @@
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
@@ -1296,7 +1331,7 @@
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv1_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
@@ -1311,8 +1346,7 @@
     ret
   }
 }
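
/* FixedDiv_X86 above returns num / div as a 16.16 fixed-point value;
 * expressed in C it is simply the sketch below (the asm does this with a
 * 64/32-bit idiv; quotient overflow is not modelled here). FixedDiv1_X86
 * is a closely related variant and is not covered by this sketch. */
#include <stdint.h>
static int FixedDiv_Ref(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}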
-
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/third_party/libyuv/source/video_common.cc b/libvpx/third_party/libyuv/source/video_common.cc
index efbedf4..379a066 100644
--- a/libvpx/third_party/libyuv/source/video_common.cc
+++ b/libvpx/third_party/libyuv/source/video_common.cc
@@ -33,7 +33,7 @@
   {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
   {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
   {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
   {FOURCC_RGB3, FOURCC_RAW },
   {FOURCC_BGR3, FOURCC_24BG},
   {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
diff --git a/libvpx/third_party/x86inc/README.libvpx b/libvpx/third_party/x86inc/README.libvpx
index 02cd9ab..fe5b076 100644
--- a/libvpx/third_party/x86inc/README.libvpx
+++ b/libvpx/third_party/x86inc/README.libvpx
@@ -1,5 +1,5 @@
 URL: http://git.videolan.org/?p=x264.git
-Version: 999b753ff0f4dc872077f4fa90d465e948cbe656
+Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
 License: ISC
 License File: LICENSE
 
@@ -8,4 +8,15 @@
 defines that help automatically allow assembly to work cross-platform.
 
 Local Modifications:
-Some modifications to allow PIC to work with x86inc.
+Get configuration from vpx_config.asm.
+Prefix functions with vpx by default.
+Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
+  exist in libvpx.
+Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
+Catch all elf formats for 'hidden' status and SECTION notes.
+Avoid 'amdnop' when building with nasm.
+Set 'private_extern' visibility for macho targets.
+Copy PIC 'GLOBAL' macros from x86_abi_support.asm
+Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
+Use .text with no alignment for aout
+Only use 'hidden' visibility with Chromium
diff --git a/libvpx/third_party/x86inc/x86inc.asm b/libvpx/third_party/x86inc/x86inc.asm
index 99453a9..77a58f2 100644
--- a/libvpx/third_party/x86inc/x86inc.asm
+++ b/libvpx/third_party/x86inc/x86inc.asm
@@ -1,12 +1,12 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2012 x264 project
+;* Copyright (C) 2005-2015 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
-;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*          Fiona Glaser <fiona@x264.com>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -36,11 +36,24 @@
 
 %include "vpx_config.asm"
 
-%define program_name vp9
+%ifndef private_prefix
+    %define private_prefix vpx
+%endif
 
+%ifndef public_prefix
+    %define public_prefix private_prefix
+%endif
 
-%define UNIX64 0
+%ifndef STACK_ALIGNMENT
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
 %define WIN64  0
+%define UNIX64 0
 %if ARCH_X86_64
     %ifidn __OUTPUT_FORMAT__,win32
         %define WIN64  1
@@ -57,8 +70,6 @@
     %define mangle(x) x
 %elifidn __OUTPUT_FORMAT__,elf64
     %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,elf
-    %define mangle(x) x
 %elifidn __OUTPUT_FORMAT__,x64
     %define mangle(x) x
 %elifidn __OUTPUT_FORMAT__,win64
@@ -67,28 +78,22 @@
     %define mangle(x) _ %+ x
 %endif
 
-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
-; Name of the .rodata section.
-; Kludge: Something on OS X fails to align .rodata even given an align attribute,
-; so use a different read-only section.
+; In some instances macho32 tables get misaligned when using .rodata.
+; When looking at the disassembly it appears that the offset is either
+; correct or consistently off by 90. Placing them in the .text section
+; works around the issue. It appears to be specific to the way libvpx
+; handles the tables.
 %macro SECTION_RODATA 0-1 16
-    %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=%1
-    %elifidn __OUTPUT_FORMAT__,macho
+    %ifidn __OUTPUT_FORMAT__,macho32
         SECTION .text align=%1
         fakegot:
     %elifidn __OUTPUT_FORMAT__,aout
-        section .text
+        SECTION .text
     %else
         SECTION .rodata align=%1
     %endif
 %endmacro
 
-; aout does not support align=
 %macro SECTION_TEXT 0-1 16
     %ifidn __OUTPUT_FORMAT__,aout
         SECTION .text
@@ -112,58 +117,58 @@
 %endif
 
 %if ABI_IS_32BIT
-  %if CONFIG_PIC=1
-  %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
-    %define WRT_PLT wrt ..plt
-    %macro GET_GOT 1
-      extern _GLOBAL_OFFSET_TABLE_
-      push %1
-      call %%get_got
-      %%sub_offset:
-      jmp %%exitGG
-      %%get_got:
-      mov %1, [esp]
-      add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
-      ret
-      %%exitGG:
-      %undef GLOBAL
-      %define GLOBAL(x) x + %1 wrt ..gotoff
-      %undef RESTORE_GOT
-      %define RESTORE_GOT pop %1
-    %endmacro
-  %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
-    %macro GET_GOT 1
-      push %1
-      call %%get_got
-      %%get_got:
-      pop  %1
-      %undef GLOBAL
-      %define GLOBAL(x) x + %1 - %%get_got
-      %undef RESTORE_GOT
-      %define RESTORE_GOT pop %1
-    %endmacro
-  %endif
-  %endif
+    %if CONFIG_PIC=1
+        %ifidn __OUTPUT_FORMAT__,elf32
+            %define GET_GOT_SAVE_ARG 1
+            %define WRT_PLT wrt ..plt
+            %macro GET_GOT 1
+                extern _GLOBAL_OFFSET_TABLE_
+                push %1
+                call %%get_got
+                %%sub_offset:
+                jmp %%exitGG
+                %%get_got:
+                mov %1, [esp]
+                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+                ret
+                %%exitGG:
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 wrt ..gotoff
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1
+            %endmacro
+        %elifidn __OUTPUT_FORMAT__,macho32
+            %define GET_GOT_SAVE_ARG 1
+            %macro GET_GOT 1
+                push %1
+                call %%get_got
+                %%get_got:
+                pop  %1
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 - %%get_got
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1
+            %endmacro
+        %endif
+    %endif
 
-  %if ARCH_X86_64 == 0
-    %undef PIC
-  %endif
+    %if ARCH_X86_64 == 0
+        %undef PIC
+    %endif
 
 %else
-  %macro GET_GOT 1
-  %endmacro
-  %define GLOBAL(x) rel x
-  %define WRT_PLT wrt ..plt
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) rel x
+    %define WRT_PLT wrt ..plt
 
-  %if WIN64
-    %define PIC
-  %elifidn __OUTPUT_FORMAT__,macho64
-    %define PIC
-  %elif CONFIG_PIC
-    %define PIC
-  %endif
+    %if WIN64
+        %define PIC
+    %elifidn __OUTPUT_FORMAT__,macho64
+        %define PIC
+    %elif CONFIG_PIC
+        %define PIC
+    %endif
 %endif
 
 %ifnmacro GET_GOT
@@ -172,10 +177,10 @@
     %define GLOBAL(x) x
 %endif
 %ifndef RESTORE_GOT
-%define RESTORE_GOT
+    %define RESTORE_GOT
 %endif
 %ifndef WRT_PLT
-%define WRT_PLT
+    %define WRT_PLT
 %endif
 
 %ifdef PIC
@@ -183,14 +188,6 @@
 %endif
 ; Done with PIC macros
 
-; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-%ifndef __NASM_VER__
-CPU amdnop
-%else
-%use smartalign
-ALIGNMODE k7
-%endif
-
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
@@ -200,12 +197,20 @@
 ; %1 = number of arguments. loads them from stack if needed.
 ; %2 = number of registers used. pushes callee-saved regs if needed.
 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment the stack will be manually aligned
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
 
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
@@ -215,40 +220,43 @@
 ; Pops anything that was pushed by PROLOGUE, and returns.
 
 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
 
 ; registers:
 ; rN and rNq are the native-size register holding function argument N
 ; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
 ; rNm is the original location of arg N (a register or on the stack), dword
 ; rNmp is native size
 
-%macro DECLARE_REG 5-6
+%macro DECLARE_REG 2-3
     %define r%1q %2
-    %define r%1d %3
-    %define r%1w %4
-    %define r%1b %5
-    %if %0 == 5
-        %define r%1m  %3
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %if %0 == 2
+        %define r%1m  %2d
         %define r%1mp %2
     %elif ARCH_X86_64 ; memory
-        %define r%1m [rsp + stack_offset + %6]
+        %define r%1m [rstk + stack_offset + %3]
         %define r%1mp qword r %+ %1 %+ m
     %else
-        %define r%1m [esp + stack_offset + %6]
+        %define r%1m [rstk + stack_offset + %3]
         %define r%1mp dword r %+ %1 %+ m
     %endif
     %define r%1  %2
 %endmacro
 
-%macro DECLARE_REG_SIZE 2
+%macro DECLARE_REG_SIZE 3
     %define r%1q r%1
     %define e%1q r%1
     %define r%1d e%1
     %define e%1d e%1
     %define r%1w %1
     %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
     %define r%1b %2
     %define e%1b %2
 %if ARCH_X86_64 == 0
@@ -256,13 +264,13 @@
 %endif
 %endmacro
 
-DECLARE_REG_SIZE ax, al
-DECLARE_REG_SIZE bx, bl
-DECLARE_REG_SIZE cx, cl
-DECLARE_REG_SIZE dx, dl
-DECLARE_REG_SIZE si, sil
-DECLARE_REG_SIZE di, dil
-DECLARE_REG_SIZE bp, bpl
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
 
 ; t# defines for when per-arch register allocation is more complex than just function arguments
 
@@ -280,6 +288,7 @@
         %define t%1q t%1 %+ q
         %define t%1d t%1 %+ d
         %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
         %define t%1b t%1 %+ b
         %rotate 1
     %endrep
@@ -295,12 +304,16 @@
 
 %macro PUSH 1
     push %1
-    %assign stack_offset stack_offset+gprsize
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
 %endmacro
 
 %macro POP 1
     pop %1
-    %assign stack_offset stack_offset-gprsize
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
 %endmacro
 
 %macro PUSH_IF_USED 1-*
@@ -332,14 +345,14 @@
 
 %macro SUB 2
     sub %1, %2
-    %ifidn %1, rsp
+    %ifidn %1, rstk
         %assign stack_offset stack_offset+(%2)
     %endif
 %endmacro
 
 %macro ADD 2
     add %1, %2
-    %ifidn %1, rsp
+    %ifidn %1, rstk
         %assign stack_offset stack_offset-(%2)
     %endif
 %endmacro
@@ -369,6 +382,7 @@
             CAT_UNDEF arg_name %+ %%i, q
             CAT_UNDEF arg_name %+ %%i, d
             CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
             CAT_UNDEF arg_name %+ %%i, b
             CAT_UNDEF arg_name %+ %%i, m
             CAT_UNDEF arg_name %+ %%i, mp
@@ -384,6 +398,7 @@
         %xdefine %1q r %+ %%i %+ q
         %xdefine %1d r %+ %%i %+ d
         %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
         %xdefine %1b r %+ %%i %+ b
         %xdefine %1m r %+ %%i %+ m
         %xdefine %1mp r %+ %%i %+ mp
@@ -395,155 +410,240 @@
     %assign n_arg_names %0
 %endmacro
 
-%if ARCH_X86_64
-%macro ALLOC_STACK 2  ; stack_size, num_regs
-  %assign %%stack_aligment ((mmsize + 15) & ~15)
-  %assign stack_size_padded %1
+%define required_stack_alignment ((mmsize + 15) & ~15)
 
-  %assign %%reg_num (%2 - 1)
-  %xdefine rsp_tmp r %+ %%reg_num
-  mov  rsp_tmp, rsp
-  sub  rsp, stack_size_padded
-  and  rsp, ~(%%stack_aligment - 1)
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1
+        %if %1 != 0
+            %assign %%pad 0
+            %assign stack_size %1
+            %if stack_size < 0
+                %assign stack_size -stack_size
+            %endif
+            %if WIN64
+                %assign %%pad %%pad + 32 ; shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+                    %endif
+                %endif
+            %endif
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1)
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                %if %1 < 0 ; need to store rsp on stack
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
+                %else ; can keep rsp in rstk during whole function
+                    %xdefine rstkm rstk
+                %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk
+            %endif
+            WIN64_PUSH_XMM
+        %endif
+    %endif
 %endmacro
 
-%macro RESTORE_STACK 0  ; reset rsp register
-  mov  rsp, rsp_tmp
+%macro SETUP_STACK_POINTER 1
+    %ifnum %1
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+            %if %1 > 0
+                %assign regs_used (regs_used + 1)
+            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
+                %warning "Stack pointer will overwrite register argument"
+            %endif
+        %endif
+    %endif
 %endmacro
-%endif
+
+%macro DEFINE_ARGS_INTERNAL 3+
+    %ifnum %2
+        DEFINE_ARGS %3
+    %elif %1 == 4
+        DEFINE_ARGS %2
+    %elif %1 > 4
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
 
 %if WIN64 ; Windows x64 ;=================================================
 
-DECLARE_REG 0,  rcx, ecx,  cx,   cl
-DECLARE_REG 1,  rdx, edx,  dx,   dl
-DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
-DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
-DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
-DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
-DECLARE_REG 6,  rax, eax,  ax,   al,   56
-DECLARE_REG 7,  rdi, edi,  di,   dil,  64
-DECLARE_REG 8,  rsi, esi,  si,   sil,  72
-DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
-DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
-DECLARE_REG 11, R12, R12D, R12W, R12B, 96
-DECLARE_REG 12, R13, R13D, R13W, R13B, 104
-DECLARE_REG 13, R14, R14D, R14W, R14B, 112
-DECLARE_REG 14, R15, R15D, R15W, R15B, 120
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
 
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
     PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
-    %if mmsize == 8
-        %assign xmm_regs_used 0
-    %else
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0
         WIN64_SPILL_XMM %3
     %endif
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
-            %assign %%i %%i-1
-            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
-        %endrep
+    %if xmm_regs_used > 8
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro
 
 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        add %1, (xmm_regs_used-6)*16+16
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
+        %endif
+    %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
 
 %macro WIN64_RESTORE_XMM 1
     WIN64_RESTORE_XMM_INTERNAL %1
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign stack_offset (stack_offset-stack_size_padded)
     %assign xmm_regs_used 0
 %endmacro
 
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL rsp
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    ret
-%endmacro
-
-%macro REP_RET 0
-    %if regs_used > 7 || xmm_regs_used > 6
-        RET
-    %else
-        rep ret
-    %endif
+%if mmsize == 32
+    vzeroupper
+%endif
+    AUTO_REP_RET
 %endmacro
 
 %elif ARCH_X86_64 ; *nix x64 ;=============================================
 
-DECLARE_REG 0,  rdi, edi,  di,   dil
-DECLARE_REG 1,  rsi, esi,  si,   sil
-DECLARE_REG 2,  rdx, edx,  dx,   dl
-DECLARE_REG 3,  rcx, ecx,  cx,   cl
-DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
-DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
-DECLARE_REG 6,  rax, eax,  ax,   al,   8
-DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
-DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
-DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
-DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
-DECLARE_REG 11, R12, R12D, R12W, R12B, 48
-DECLARE_REG 12, R13, R13D, R13W, R13B, 56
-DECLARE_REG 13, R14, R14D, R14W, R14B, 64
-DECLARE_REG 14, R15, R15D, R15W, R15B, 72
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
 
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+
 %macro RET 0
+%if stack_size_padded > 0
+%if required_stack_alignment > STACK_ALIGNMENT
+    mov rsp, rstkm
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    ret
-%endmacro
-
-%macro REP_RET 0
-    %if regs_used > 9
-        RET
-    %else
-        rep ret
-    %endif
+%if mmsize == 32
+    vzeroupper
+%endif
+    AUTO_REP_RET
 %endmacro
 
 %else ; X86_32 ;==============================================================
 
-DECLARE_REG 0, eax, eax, ax, al,   4
-DECLARE_REG 1, ecx, ecx, cx, cl,   8
-DECLARE_REG 2, edx, edx, dx, dl,   12
-DECLARE_REG 3, ebx, ebx, bx, bl,   16
-DECLARE_REG 4, esi, esi, si, null, 20
-DECLARE_REG 5, edi, edi, di, null, 24
-DECLARE_REG 6, ebp, ebp, bp, null, 28
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
 %define rsp esp
 
 %macro DECLARE_ARG 1-*
     %rep %0
-        %define r%1m [esp + stack_offset + 4*%1 + 4]
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
         %define r%1mp dword r%1m
         %rotate 1
     %endrep
@@ -551,29 +651,39 @@
 
 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7
+        %assign num_args 7
+    %endif
     %if regs_used > 7
         %assign regs_used 7
     %endif
-    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 7
     PUSH_IF_USED 3, 4, 5, 6
+    ALLOC_STACK %4
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS %4
+    DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+
 %macro RET 0
+%if stack_size_padded > 0
+%if required_stack_alignment > STACK_ALIGNMENT
+    mov rsp, rstkm
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 6, 5, 4, 3
-    ret
-%endmacro
-
-%macro REP_RET 0
-    %if regs_used > 3
-        RET
-    %else
-        rep ret
-    %endif
+%if mmsize == 32
+    vzeroupper
+%endif
+    AUTO_REP_RET
 %endmacro
 
 %endif ;======================================================================
@@ -583,8 +693,54 @@
 %endmacro
 %macro WIN64_RESTORE_XMM 1
 %endmacro
+%macro WIN64_PUSH_XMM 0
+%endmacro
 %endif
 
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+    %if has_epilogue
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
+    %elif notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %%branch_instr:
+            %xdefine last_branch_adr %%branch_instr
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+%endmacro
+
 ;=============================================================================
 ; arch-independent part
 ;=============================================================================
@@ -595,48 +751,69 @@
 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
 ; subsequent uses of the function name automatically refer to the mangled version.
 ; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
-    cglobal_internal %1 %+ SUFFIX
-%else
-    cglobal_internal %1 %+ SUFFIX, %2
-%endif
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 1, %1 %+ SUFFIX, %2
 %endmacro
-%macro cglobal_internal 1-2+
-    %ifndef cglobaled_%1
-        %xdefine %1 mangle(program_name %+ _ %+ %1)
-        %xdefine %1.skip_prologue %1 %+ .skip_prologue
-        CAT_XDEFINE cglobaled_, %1, 1
-    %endif
-    %xdefine current_function %1
-    %ifdef CHROMIUM
-        %ifidn __OUTPUT_FORMAT__,elf
-            global %1:function hidden
-        %elifidn __OUTPUT_FORMAT__,elf32
-            global %1:function hidden
-        %elifidn __OUTPUT_FORMAT__,elf64
-            global %1:function hidden
-        %elifidn __OUTPUT_FORMAT__,macho32
-            global %1:private_extern
-        %elifidn __OUTPUT_FORMAT__,macho64
-            global %1:private_extern
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+    %if %1
+        %xdefine %%FUNCTION_PREFIX private_prefix
+        ; libvpx explicitly sets visibility in shared object builds. Avoid
+        ; setting visibility to hidden as it may break builds that split
+        ; sources on e.g., directory boundaries.
+        %ifdef CHROMIUM
+            %xdefine %%VISIBILITY hidden
         %else
-            global %1
+            %xdefine %%VISIBILITY
         %endif
     %else
-        global %1
+        %xdefine %%FUNCTION_PREFIX public_prefix
+        %xdefine %%VISIBILITY
+    %endif
+    %ifndef cglobaled_%2
+        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %xdefine %2.skip_prologue %2 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %2, 1
+    %endif
+    %xdefine current_function %2
+    %ifidn __OUTPUT_FORMAT__,elf32
+        global %2:function %%VISIBILITY
+    %elifidn __OUTPUT_FORMAT__,elf64
+        global %2:function %%VISIBILITY
+    %elifidn __OUTPUT_FORMAT__,macho32
+        %ifdef __NASM_VER__
+            global %2
+        %else
+            global %2:private_extern
+        %endif
+    %elifidn __OUTPUT_FORMAT__,macho64
+        %ifdef __NASM_VER__
+            global %2
+        %else
+            global %2:private_extern
+        %endif
+    %else
+        global %2
     %endif
     align function_align
-    %1:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %assign stack_offset 0
-    %if %0 > 1
-        PROLOGUE %2
+    %2:
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %ifnidn %3, ""
+        PROLOGUE %3
     %endif
 %endmacro
 
 %macro cextern 1
-    %xdefine %1 mangle(program_name %+ _ %+ %1)
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     CAT_XDEFINE cglobaled_, %1, 1
     extern %1
 %endmacro
@@ -648,17 +825,21 @@
     extern %1
 %endmacro
 
-%macro const 2+
-    %xdefine %1 mangle(program_name %+ _ %+ %1)
-    global %1
+%macro const 1-2+
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    %ifidn __OUTPUT_FORMAT__,elf32
+        global %1:data hidden
+    %elifidn __OUTPUT_FORMAT__,elf64
+        global %1:data hidden
+    %else
+        global %1
+    %endif
     %1: %2
 %endmacro
 
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
 ; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%elifidn __OUTPUT_FORMAT__,elf32
+%ifidn __OUTPUT_FORMAT__,elf32
 SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %elifidn __OUTPUT_FORMAT__,elf64
 SECTION .note.GNU-stack noalloc noexec nowrite progbits
@@ -669,7 +850,7 @@
 %assign cpuflags_mmx      (1<<0)
 %assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
 %assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
@@ -680,51 +861,71 @@
 %assign cpuflags_avx      (1<<11)| cpuflags_sse42
 %assign cpuflags_xop      (1<<12)| cpuflags_avx
 %assign cpuflags_fma4     (1<<13)| cpuflags_avx
+%assign cpuflags_fma3     (1<<14)| cpuflags_avx
+%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
 
 %assign cpuflags_cache32  (1<<16)
 %assign cpuflags_cache64  (1<<17)
 %assign cpuflags_slowctz  (1<<18)
 %assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_misalign (1<<20)
-%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<22)
+%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<21)
+%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
 
 %define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
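
/* Hedged C analogue of the cpuflag()/notcpuflag() tests above (purely
 * illustrative; x86inc itself is NASM and these names are not part of it):
 * a feature counts as available only when every bit implied by its
 * cpuflags_* mask is set, so e.g. an avx2 build implicitly requires the
 * whole fma3/avx/sse4.2/... chain defined above. */
#include <stdbool.h>
static bool cpuflag_present(unsigned cpuflags, unsigned cpuflags_x) {
  return (cpuflags & cpuflags_x) == cpuflags_x;  /* all implied bits set */
}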
 
-; Takes up to 2 cpuflags from the above list.
+; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
-%macro INIT_CPUFLAGS 0-2
+%macro INIT_CPUFLAGS 0-*
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
     %if %0 >= 1
-        %xdefine cpuname %1
-        %assign cpuflags cpuflags_%1
-        %if %0 >= 2
-            %xdefine cpuname %1_%2
-            %assign cpuflags cpuflags | cpuflags_%2
-        %endif
+        %rep %0
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
         %xdefine SUFFIX _ %+ cpuname
+
         %if cpuflag(avx)
             %assign avx_enabled 1
         %endif
-        %if mmsize == 16 && notcpuflag(sse2)
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
             %define mova movaps
             %define movu movups
             %define movnta movntps
         %endif
         %if cpuflag(aligned)
             %define movu mova
-        %elifidn %1, sse3
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
             %define movu lddqu
         %endif
+    %endif
+
+    %ifdef __NASM_VER__
+        %use smartalign
+        ALIGNMODE k7
+    %elif ARCH_X86_64 || cpuflag(sse2)
+        CPU amdnop
     %else
-        %xdefine SUFFIX
-        %undef cpuname
-        %undef cpuflags
+        CPU basicnop
     %endif
 %endmacro
 
-; merge mmx and sse*
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -746,12 +947,12 @@
     %assign %%i 0
     %rep 8
     CAT_XDEFINE m, %%i, mm %+ %%i
-    CAT_XDEFINE nmm, %%i, %%i
+    CAT_XDEFINE nnmm, %%i, %%i
     %assign %%i %%i+1
     %endrep
     %rep 8
     CAT_UNDEF m, %%i
-    CAT_UNDEF nmm, %%i
+    CAT_UNDEF nnmm, %%i
     %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
@@ -772,20 +973,12 @@
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, xmm %+ %%i
-    CAT_XDEFINE nxmm, %%i, %%i
+    CAT_XDEFINE nnxmm, %%i, %%i
     %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
 
-; FIXME: INIT_AVX can be replaced by INIT_XMM avx
-%macro INIT_AVX 0
-    INIT_XMM
-    %assign avx_enabled 1
-    %define PALIGNR PALIGNR_SSSE3
-    %define RESET_MM_PERMUTATION INIT_AVX
-%endmacro
-
 %macro INIT_YMM 0-1+
     %assign avx_enabled 1
     %define RESET_MM_PERMUTATION INIT_YMM %1
@@ -794,14 +987,14 @@
     %if ARCH_X86_64
     %define num_mmregs 16
     %endif
-    %define mova vmovaps
-    %define movu vmovups
+    %define mova movdqa
+    %define movu movdqu
     %undef movh
-    %define movnta vmovntps
+    %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, ymm %+ %%i
-    CAT_XDEFINE nymm, %%i, %%i
+    CAT_XDEFINE nnymm, %%i, %%i
     %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
@@ -809,6 +1002,26 @@
 
 INIT_XMM
 
+%macro DECLARE_MMCAST 1
+    %define  mmmm%1   mm%1
+    %define  mmxmm%1  mm%1
+    %define  mmymm%1  mm%1
+    %define xmmmm%1   mm%1
+    %define xmmxmm%1 xmm%1
+    %define xmmymm%1 xmm%1
+    %define ymmmm%1   mm%1
+    %define ymmxmm%1 xmm%1
+    %define ymmymm%1 ymm%1
+    %define xm%1 xmm %+ m%1
+    %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16
+    DECLARE_MMCAST i
+%assign i i+1
+%endrep
+
 ; I often want to use macros that permute their arguments. e.g. there's no
 ; efficient way to implement butterfly or transpose or dct without swapping some
 ; arguments.
@@ -825,42 +1038,42 @@
 
 %macro PERMUTE 2-* ; takes a list of pairs to swap
 %rep %0/2
-    %xdefine tmp%2 m%2
-    %xdefine ntmp%2 nm%2
+    %xdefine %%tmp%2 m%2
     %rotate 2
 %endrep
 %rep %0/2
-    %xdefine m%1 tmp%2
-    %xdefine nm%1 ntmp%2
-    %undef tmp%2
-    %undef ntmp%2
+    %xdefine m%1 %%tmp%2
+    CAT_XDEFINE nn, m%1, %1
     %rotate 2
 %endrep
 %endmacro
 
-%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
-%rep %0-1
-%ifdef m%1
-    %xdefine tmp m%1
-    %xdefine m%1 m%2
-    %xdefine m%2 tmp
-    CAT_XDEFINE n, m%1, %1
-    CAT_XDEFINE n, m%2, %2
-%else
-    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
-    ; Be careful using this mode in nested macros though, as in some cases there may be
-    ; other copies of m# that have already been dereferenced and don't get updated correctly.
-    %xdefine %%n1 n %+ %1
-    %xdefine %%n2 n %+ %2
-    %xdefine tmp m %+ %%n1
-    CAT_XDEFINE m, %%n1, m %+ %%n2
-    CAT_XDEFINE m, %%n2, tmp
-    CAT_XDEFINE n, m %+ %%n1, %%n1
-    CAT_XDEFINE n, m %+ %%n2, %%n2
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+%ifnum %1 ; SWAP 0, 1, ...
+    SWAP_INTERNAL_NUM %1, %2
+%else ; SWAP m0, m1, ...
+    SWAP_INTERNAL_NAME %1, %2
 %endif
-    %undef tmp
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-*
+    %rep %0-1
+        %xdefine %%tmp m%1
+        %xdefine m%1 m%2
+        %xdefine m%2 %%tmp
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
     %rotate 1
-%endrep
+    %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+    %xdefine %%args nn %+ %1
+    %rep %0-1
+        %xdefine %%args %%args, nn %+ %2
+    %rotate 1
+    %endrep
+    SWAP_INTERNAL_NUM %%args
 %endmacro
 
 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
@@ -884,7 +1097,7 @@
         %assign %%i 0
         %rep num_mmregs
             CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE n, m %+ %%i, %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
         %assign %%i %%i+1
         %endrep
     %endif
@@ -945,246 +1158,365 @@
 %endrep
 %undef i
 
-;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == number of operands given
-;%5+: operands
-%macro RUN_AVX_INSTR 6-7+
-    %ifid %5
-        %define %%size sizeof%5
-    %else
-        %define %%size mmsize
-    %endif
-    %if %%size==32
-        %if %0 >= 7
-            v%1 %5, %6, %7
-        %else
-            v%1 %5, %6
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-avx emulation of ``%%opcode'' is not supported
         %endif
-    %else
-        %if %%size==8
-            %define %%regmov movq
-        %elif %2
-            %define %%regmov movaps
-        %else
-            %define %%regmov movdqa
-        %endif
+        %rotate 1
+    %endrep
+%endmacro
 
-        %if %4>=3+%3
-            %ifnidn %5, %6
-                %if avx_enabled && sizeof%5==16
-                    v%1 %5, %6, %7
-                %else
-                    %%regmov %5, %6
-                    %1 %5, %7
-                %endif
-            %else
-                %1 %5, %7
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
+        %assign __sizeofreg sizeof%6
+    %else
+        %assign __sizeofreg mmsize
+    %endif
+    %assign __emulate_avx 0
+    %if avx_enabled && __sizeofreg >= 16
+        %xdefine __instr v%1
+    %else
+        %xdefine __instr %1
+        %if %0 >= 8+%4
+            %assign __emulate_avx 1
+        %endif
+    %endif
+    %ifnidn %2, fnord
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+                %error use of ``%1'' sse2 instruction in cpuname function: current_function
             %endif
-        %elif %3
-            %1 %5, %6, %7
-        %else
-            %1 %5, %6
         %endif
     %endif
-%endmacro
 
-; 3arg AVX ops with a memory arg can only have it in src2,
-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
-; So, if the op is symmetric and the wrong one is memory, swap them.
-%macro RUN_AVX_INSTR1 8
-    %assign %%swap 0
-    %if avx_enabled
-        %ifnid %6
-            %assign %%swap 1
+    %if __emulate_avx
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %ifnidn %6, %7
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
+            %endif
+            %if %5 && %4 == 0
+                %ifnid %8
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %endif
+            %endif
+            %if __sizeofreg == 8
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
+            %else
+                MOVDQA %6, __src1
+            %endif
         %endif
-    %elifnidn %5, %6
-        %ifnid %7
-            %assign %%swap 1
+        %if %0 >= 9
+            %1 %6, __src2, %9
+        %else
+            %1 %6, __src2
         %endif
-    %endif
-    %if %%swap && %3 == 0 && %8 == 1
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
+    %elif %0 == 7
+        __instr %6, %7
     %else
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+        __instr %6
     %endif
 %endmacro
 
 ;%1 == instruction
-;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 4
-    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
-        %ifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro
 
-AVX_INSTR addpd, 1, 0, 1
-AVX_INSTR addps, 1, 0, 1
-AVX_INSTR addsd, 1, 0, 1
-AVX_INSTR addss, 1, 0, 1
-AVX_INSTR addsubpd, 1, 0, 0
-AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
-AVX_INSTR andnpd, 1, 0, 0
-AVX_INSTR andnps, 1, 0, 0
-AVX_INSTR blendpd, 1, 0, 0
-AVX_INSTR blendps, 1, 0, 0
-AVX_INSTR blendvpd, 1, 0, 0
-AVX_INSTR blendvps, 1, 0, 0
-AVX_INSTR cmppd, 1, 0, 0
-AVX_INSTR cmpps, 1, 0, 0
-AVX_INSTR cmpsd, 1, 0, 0
-AVX_INSTR cmpss, 1, 0, 0
-AVX_INSTR cvtdq2ps, 1, 0, 0
-AVX_INSTR cvtps2dq, 1, 0, 0
-AVX_INSTR divpd, 1, 0, 0
-AVX_INSTR divps, 1, 0, 0
-AVX_INSTR divsd, 1, 0, 0
-AVX_INSTR divss, 1, 0, 0
-AVX_INSTR dppd, 1, 1, 0
-AVX_INSTR dpps, 1, 1, 0
-AVX_INSTR haddpd, 1, 0, 0
-AVX_INSTR haddps, 1, 0, 0
-AVX_INSTR hsubpd, 1, 0, 0
-AVX_INSTR hsubps, 1, 0, 0
-AVX_INSTR maxpd, 1, 0, 1
-AVX_INSTR maxps, 1, 0, 1
-AVX_INSTR maxsd, 1, 0, 1
-AVX_INSTR maxss, 1, 0, 1
-AVX_INSTR minpd, 1, 0, 1
-AVX_INSTR minps, 1, 0, 1
-AVX_INSTR minsd, 1, 0, 1
-AVX_INSTR minss, 1, 0, 1
-AVX_INSTR movhlps, 1, 0, 0
-AVX_INSTR movlhps, 1, 0, 0
-AVX_INSTR movsd, 1, 0, 0
-AVX_INSTR movss, 1, 0, 0
-AVX_INSTR mpsadbw, 0, 1, 0
-AVX_INSTR mulpd, 1, 0, 1
-AVX_INSTR mulps, 1, 0, 1
-AVX_INSTR mulsd, 1, 0, 1
-AVX_INSTR mulss, 1, 0, 1
-AVX_INSTR orpd, 1, 0, 1
-AVX_INSTR orps, 1, 0, 1
-AVX_INSTR packsswb, 0, 0, 0
-AVX_INSTR packssdw, 0, 0, 0
-AVX_INSTR packuswb, 0, 0, 0
-AVX_INSTR packusdw, 0, 0, 0
-AVX_INSTR paddb, 0, 0, 1
-AVX_INSTR paddw, 0, 0, 1
-AVX_INSTR paddd, 0, 0, 1
-AVX_INSTR paddq, 0, 0, 1
-AVX_INSTR paddsb, 0, 0, 1
-AVX_INSTR paddsw, 0, 0, 1
-AVX_INSTR paddusb, 0, 0, 1
-AVX_INSTR paddusw, 0, 0, 1
-AVX_INSTR palignr, 0, 1, 0
-AVX_INSTR pand, 0, 0, 1
-AVX_INSTR pandn, 0, 0, 0
-AVX_INSTR pavgb, 0, 0, 1
-AVX_INSTR pavgw, 0, 0, 1
-AVX_INSTR pblendvb, 0, 0, 0
-AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pcmpestri, 0, 0, 0
-AVX_INSTR pcmpestrm, 0, 0, 0
-AVX_INSTR pcmpistri, 0, 0, 0
-AVX_INSTR pcmpistrm, 0, 0, 0
-AVX_INSTR pcmpeqb, 0, 0, 1
-AVX_INSTR pcmpeqw, 0, 0, 1
-AVX_INSTR pcmpeqd, 0, 0, 1
-AVX_INSTR pcmpeqq, 0, 0, 1
-AVX_INSTR pcmpgtb, 0, 0, 0
-AVX_INSTR pcmpgtw, 0, 0, 0
-AVX_INSTR pcmpgtd, 0, 0, 0
-AVX_INSTR pcmpgtq, 0, 0, 0
-AVX_INSTR phaddw, 0, 0, 0
-AVX_INSTR phaddd, 0, 0, 0
-AVX_INSTR phaddsw, 0, 0, 0
-AVX_INSTR phsubw, 0, 0, 0
-AVX_INSTR phsubd, 0, 0, 0
-AVX_INSTR phsubsw, 0, 0, 0
-AVX_INSTR pmaddwd, 0, 0, 1
-AVX_INSTR pmaddubsw, 0, 0, 0
-AVX_INSTR pmaxsb, 0, 0, 1
-AVX_INSTR pmaxsw, 0, 0, 1
-AVX_INSTR pmaxsd, 0, 0, 1
-AVX_INSTR pmaxub, 0, 0, 1
-AVX_INSTR pmaxuw, 0, 0, 1
-AVX_INSTR pmaxud, 0, 0, 1
-AVX_INSTR pminsb, 0, 0, 1
-AVX_INSTR pminsw, 0, 0, 1
-AVX_INSTR pminsd, 0, 0, 1
-AVX_INSTR pminub, 0, 0, 1
-AVX_INSTR pminuw, 0, 0, 1
-AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmulhuw, 0, 0, 1
-AVX_INSTR pmulhrsw, 0, 0, 1
-AVX_INSTR pmulhw, 0, 0, 1
-AVX_INSTR pmullw, 0, 0, 1
-AVX_INSTR pmulld, 0, 0, 1
-AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR pmuldq, 0, 0, 1
-AVX_INSTR por, 0, 0, 1
-AVX_INSTR psadbw, 0, 0, 1
-AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR psignb, 0, 0, 0
-AVX_INSTR psignw, 0, 0, 0
-AVX_INSTR psignd, 0, 0, 0
-AVX_INSTR psllw, 0, 0, 0
-AVX_INSTR pslld, 0, 0, 0
-AVX_INSTR psllq, 0, 0, 0
-AVX_INSTR pslldq, 0, 0, 0
-AVX_INSTR psraw, 0, 0, 0
-AVX_INSTR psrad, 0, 0, 0
-AVX_INSTR psrlw, 0, 0, 0
-AVX_INSTR psrld, 0, 0, 0
-AVX_INSTR psrlq, 0, 0, 0
-AVX_INSTR psrldq, 0, 0, 0
-AVX_INSTR psubb, 0, 0, 0
-AVX_INSTR psubw, 0, 0, 0
-AVX_INSTR psubd, 0, 0, 0
-AVX_INSTR psubq, 0, 0, 0
-AVX_INSTR psubsb, 0, 0, 0
-AVX_INSTR psubsw, 0, 0, 0
-AVX_INSTR psubusb, 0, 0, 0
-AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR punpckhbw, 0, 0, 0
-AVX_INSTR punpckhwd, 0, 0, 0
-AVX_INSTR punpckhdq, 0, 0, 0
-AVX_INSTR punpckhqdq, 0, 0, 0
-AVX_INSTR punpcklbw, 0, 0, 0
-AVX_INSTR punpcklwd, 0, 0, 0
-AVX_INSTR punpckldq, 0, 0, 0
-AVX_INSTR punpcklqdq, 0, 0, 0
-AVX_INSTR pxor, 0, 0, 1
-AVX_INSTR shufps, 1, 1, 0
-AVX_INSTR subpd, 1, 0, 0
-AVX_INSTR subps, 1, 0, 0
-AVX_INSTR subsd, 1, 0, 0
-AVX_INSTR subss, 1, 0, 0
-AVX_INSTR unpckhpd, 1, 0, 0
-AVX_INSTR unpckhps, 1, 0, 0
-AVX_INSTR unpcklpd, 1, 0, 0
-AVX_INSTR unpcklps, 1, 0, 0
-AVX_INSTR xorpd, 1, 0, 1
-AVX_INSTR xorps, 1, 0, 1
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
 
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
-AVX_INSTR pfadd, 1, 0, 1
-AVX_INSTR pfsub, 1, 0, 0
-AVX_INSTR pfmul, 1, 0, 1
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
 ; base-4 constants for shuffles
 %assign i 0
@@ -1208,13 +1540,69 @@
     %macro %1 4-7 %1, %2, %3
         %if cpuflag(xop)
             v%5 %1, %2, %3, %4
-        %else
+        %elifnidn %1, %4
             %6 %1, %2, %3
             %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
         %endif
     %endmacro
 %endmacro
 
-FMA_INSTR  pmacsdd,  pmulld, paddd
 FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
 FMA_INSTR pmadcswd, pmaddwd, paddd
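+; For the entries above, e.g. "pmacsdd m0, m1, m2, m3" computes
+; m0 = m1 * m2 + m3; without the xop cpuflag it expands to "pmulld m0, m1, m2"
+; followed by "paddd m0, m3" (the mN register names here are placeholders).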
+
+; convert FMA4 to FMA3 if possible
+%macro FMA4_INSTR 4
+    %macro %1 4-8 %1, %2, %3, %4
+        %if cpuflag(fma4)
+            v%5 %1, %2, %3, %4
+        %elifidn %1, %2
+            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
+        %elifidn %1, %3
+            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
+        %elifidn %1, %4
+            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
+        %else
+            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
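+; For illustration (using the fmaddps entry below, with placeholder register
+; names m0-m2): "fmaddps m0, m0, m1, m2" computes m0 = m0 * m1 + m2; since
+; the destination aliases the first source, without the fma4 cpuflag this is
+; emitted as "vfmadd132ps m0, m2, m1", which yields the same result.
+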
+FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
+FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
+FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
+FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+
+FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
+FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
+FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
+FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
+
+FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
+FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
+FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
+FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
+
+FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
+FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
+FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
+FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
+
+FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
+FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
+FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
+FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
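+; (movddup covers the xmm case by duplicating the low 64 bits of the source;
+; vbroadcastsd broadcasts the same 64-bit pattern for the wider case.)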
+%if ARCH_X86_64 == 0
+%macro vpbroadcastq 2
+%if sizeof%1 == 16
+    movddup %1, %2
+%else
+    vbroadcastsd %1, %2
+%endif
+%endmacro
+%endif
diff --git a/libvpx/tools_common.c b/libvpx/tools_common.c
index 7cfd066..20b259c 100644
--- a/libvpx/tools_common.c
+++ b/libvpx/tools_common.c
@@ -16,11 +16,11 @@
 
 #include "./tools_common.h"
 
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 #include "vpx/vp8cx.h"
 #endif
 
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -83,7 +83,7 @@
   struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
   int plane = 0;
   int shortread = 0;
-  const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+  const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     uint8_t *ptr;
@@ -130,7 +130,13 @@
   return shortread;
 }
 
+#if CONFIG_ENCODERS
+
 static const VpxInterface vpx_encoders[] = {
+#if CONFIG_VP10_ENCODER
+  {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx},
+#endif
+
 #if CONFIG_VP8_ENCODER
   {"vp8", VP8_FOURCC, &vpx_codec_vp8_cx},
 #endif
@@ -140,7 +146,7 @@
 #endif
 };
 
-int get_vpx_encoder_count() {
+int get_vpx_encoder_count(void) {
   return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]);
 }
 
@@ -160,6 +166,10 @@
   return NULL;
 }
 
+#endif  // CONFIG_ENCODERS
+
+#if CONFIG_DECODERS
+
 static const VpxInterface vpx_decoders[] = {
 #if CONFIG_VP8_DECODER
   {"vp8", VP8_FOURCC, &vpx_codec_vp8_dx},
@@ -168,9 +178,13 @@
 #if CONFIG_VP9_DECODER
   {"vp9", VP9_FOURCC, &vpx_codec_vp9_dx},
 #endif
+
+#if CONFIG_VP10_DECODER
+  {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx},
+#endif
 };
 
-int get_vpx_decoder_count() {
+int get_vpx_decoder_count(void) {
   return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]);
 }
 
@@ -202,6 +216,8 @@
   return NULL;
 }
 
+#endif  // CONFIG_DECODERS
+
 // TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
 // of vpx_image_t support
 int vpx_img_plane_width(const vpx_image_t *img, int plane) {
@@ -224,7 +240,8 @@
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
@@ -241,7 +258,8 @@
   for (plane = 0; plane < 3; ++plane) {
     unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
@@ -266,3 +284,219 @@
     return kMaxPSNR;
   }
 }
+
+// TODO(debargha): Consolidate the functions below into a separate file.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                               int input_shift) {
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+
+static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                              int input_shift) {
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                     int input_shift) {
+  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_upshift(dst, src, input_shift);
+  } else {
+    lowbd_img_upshift(dst, src, input_shift);
+  }
+}
+
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+  int plane;
+  if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+      dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (uint8_t)(*p_src++);
+      }
+    }
+  }
+}
+
+static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                 int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = *p_src++ >> down_shift;
+    }
+  }
+}
+
+static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++ >> down_shift;
+      }
+    }
+  }
+}
+
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                       int down_shift) {
+  if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_downshift(dst, src, down_shift);
+  } else {
+    lowbd_img_downshift(dst, src, down_shift);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/tools_common.h b/libvpx/tools_common.h
index 558413e..98347b6 100644
--- a/libvpx/tools_common.h
+++ b/libvpx/tools_common.h
@@ -16,6 +16,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
 
 #if CONFIG_ENCODERS
 #include "./y4minput.h"
@@ -34,7 +35,6 @@
 #if CONFIG_OS_SUPPORT
 #if defined(_MSC_VER)
 #include <io.h>  /* NOLINT */
-#define snprintf _snprintf
 #define isatty   _isatty
 #define fileno   _fileno
 #else
@@ -62,6 +62,7 @@
 
 #define VP8_FOURCC 0x30385056
 #define VP9_FOURCC 0x30395056
+#define VP10_FOURCC 0x303a5056
 
 enum VideoFileType {
   FILE_TYPE_RAW,
@@ -89,6 +90,7 @@
   enum VideoFileType file_type;
   uint32_t width;
   uint32_t height;
+  struct VpxRational pixel_aspect_ratio;
   vpx_img_fmt_t fmt;
   vpx_bit_depth_t bit_depth;
   int only_i420;
@@ -103,17 +105,25 @@
 extern "C" {
 #endif
 
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...);
-void fatal(const char *fmt, ...);
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
 void warn(const char *fmt, ...);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
 
 /* The tool including this file must define usage_exit() */
-void usage_exit();
+void usage_exit(void) VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
@@ -123,11 +133,11 @@
   vpx_codec_iface_t *(*const codec_interface)();
 } VpxInterface;
 
-int get_vpx_encoder_count();
+int get_vpx_encoder_count(void);
 const VpxInterface *get_vpx_encoder_by_index(int i);
 const VpxInterface *get_vpx_encoder_by_name(const char *name);
 
-int get_vpx_decoder_count();
+int get_vpx_decoder_count(void);
 const VpxInterface *get_vpx_decoder_by_index(int i);
 const VpxInterface *get_vpx_decoder_by_name(const char *name);
 const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc);
@@ -141,6 +151,12 @@
 
 double sse_to_psnr(double samples, double peak, double mse);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift);
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift);
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src);
+#endif
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/libvpx/usage.dox b/libvpx/usage.dox
index 237b8dc..8823520 100644
--- a/libvpx/usage.dox
+++ b/libvpx/usage.dox
@@ -12,13 +12,13 @@
     - \ref usage_init
     - \ref usage_errors
 
-    Fore more information on decoder and encoder specific usage, see the
+    For more information on decoder and encoder specific usage, see the
     following pages:
     \if decoder
-    - \subpage usage_decode
+    \li \subpage usage_decode
     \endif
-    \if decoder
-    - \subpage usage_encode
+    \if encoder
+    \li \subpage usage_encode
     \endif
 
     \section usage_types Important Data Types
@@ -80,10 +80,13 @@
 
 
     The available initialization methods are:
-    \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
-    \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
-    \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
-
+    \if encoder
+    \li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+    \li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+    \endif
+    \if decoder
+    \li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver())
+    \endif
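+
+    \if decoder
+    For illustration, a minimal decoder setup (assuming a VP9-enabled build)
+    might look like:
+    \code
+    vpx_codec_ctx_t codec;
+    if (vpx_codec_dec_init(&codec, vpx_codec_vp9_dx(), NULL, 0))
+        printf("Failed to initialize the decoder\n");
+    \endcode
+    \endif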
 
 
     \section usage_errors Error Handling
diff --git a/libvpx/usage_cx.dox b/libvpx/usage_cx.dox
index 62f3e45..92b0d34 100644
--- a/libvpx/usage_cx.dox
+++ b/libvpx/usage_cx.dox
@@ -1,4 +1,4 @@
-/*! \page usage_encode Encode
+/*! \page usage_encode Encoding
 
     The vpx_codec_encode() function is at the core of the encode loop. It
     processes raw images passed by the application, producing packets of
diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c
index 54afc13..8dfd4ce 100644
--- a/libvpx/vp8/common/alloccommon.c
+++ b/libvpx/vp8/common/alloccommon.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "alloccommon.h"
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxc_int.h"
@@ -103,9 +104,9 @@
         goto allocation_fail;
 
     oci->post_proc_buffer_int_used = 0;
-    vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
-    vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
-               oci->post_proc_buffer.frame_size);
+    memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    memset(oci->post_proc_buffer.buffer_alloc, 128,
+           oci->post_proc_buffer.frame_size);
 
     /* Allocate buffer to store post-processing filter coefficients.
      *
@@ -176,7 +177,7 @@
     oci->clamp_type = RECON_CLAMP_REQUIRED;
 
     /* Initialize reference frame sign bias structure to defaults */
-    vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+    memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
 
     /* Default disable buffer to buffer copying */
     oci->copy_buffer_to_gf = 0;
diff --git a/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
index 2510ad8..db48ded 100644
--- a/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -165,7 +165,7 @@
     str     r1, [r2], r12           ; store output to dst
     bne     vp8_dequant_idct_loop2_v6
 
-; vpx_memset
+; memset
     sub     r0, r0, #32
     add     sp, sp, #4
 
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
deleted file mode 100644
index 3991957..0000000
--- a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
deleted file mode 100644
index 915ee49..0000000
--- a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add    r4, r4, r6           ; add positive differences to sum
-    sub    r4, r4, r7           ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
diff --git a/libvpx/vp8/common/arm/filter_arm.c b/libvpx/vp8/common/arm/filter_arm.c
index 7fe3967..d6a6781 100644
--- a/libvpx/vp8/common/arm/filter_arm.c
+++ b/libvpx/vp8/common/arm/filter_arm.c
@@ -99,7 +99,7 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */
 
 
     HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
@@ -147,7 +147,7 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */
 
     HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
     VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
@@ -189,7 +189,7 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(4, short, FData[24*16]);    /* Temp data buffer used in filtering */
 
     HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
     VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
diff --git a/libvpx/vp8/common/arm/loopfilter_arm.c b/libvpx/vp8/common/arm/loopfilter_arm.c
index f37ca63..5840c2b 100644
--- a/libvpx/vp8/common/arm/loopfilter_arm.c
+++ b/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -25,22 +25,18 @@
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 #endif
 
-#if HAVE_NEON_ASM || HAVE_NEON
+#if HAVE_NEON
 typedef void loopfilter_y_neon(unsigned char *src, int pitch,
         unsigned char blimit, unsigned char limit, unsigned char thresh);
 typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
         unsigned char blimit, unsigned char limit, unsigned char thresh,
         unsigned char *v);
-#endif
 
-#if HAVE_NEON_ASM
 extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
 extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-#endif
 
-#if HAVE_NEON
 extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
@@ -150,9 +146,7 @@
     if (u_ptr)
         vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
-#endif
 
-#if HAVE_NEON_ASM
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
index d77f2ba..9824a31 100644
--- a/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,7 @@
 
 #include <arm_neon.h>
 
-static const uint16_t bifilter4_coeff[8][2] = {
+static const uint8_t bifilter4_coeff[8][2] = {
     {128,   0},
     {112,  16},
     { 96,  32},
@@ -64,8 +64,8 @@
         q1u8 = vcombine_u8(d2u8, d3u8);
         q2u8 = vcombine_u8(d4u8, d5u8);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
         q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@@ -155,8 +155,8 @@
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q5u8 = vld1q_u8(src_ptr);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@@ -245,8 +245,8 @@
         q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
diff --git a/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
deleted file mode 100644
index a8730aa..0000000
--- a/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,595 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
-    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
-    push            {r4-r8, lr}
-    vpush           {d8-d15}
-
-    cmp             r3, #0
-    beq             case_dc_pred
-    cmp             r3, #1
-    beq             case_v_pred
-    cmp             r3, #2
-    beq             case_h_pred
-    cmp             r3, #3
-    beq             case_tm_pred
-
-case_dc_pred
-    ldr             r4, [sp, #88]       ; Up
-    ldr             r5, [sp, #92]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to interger registers
-
-skip_dc_pred_up
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-case_v_pred
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_h_pred
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_tm_pred
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
-    push            {r4-r8, lr}
-    vpush           {d8-d15}
-
-    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
-    cmp             r3, #0
-    beq             case_dc_pred_s
-    cmp             r3, #1
-    beq             case_v_pred_s
-    cmp             r3, #2
-    beq             case_h_pred_s
-    cmp             r3, #3
-    beq             case_tm_pred_s
-
-case_dc_pred_s
-    ldr             r4, [sp, #88]       ; Up
-    ldr             r5, [sp, #92]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left_s
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up_s
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to interger registers
-
-skip_dc_pred_up_s
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left_s
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left_s
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left_s
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-case_v_pred_s
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_h_pred_s
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_tm_pred_s
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop_s
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop_s
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-    ENDP
-
-
-    END
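For reference, the case_dc_pred_s path above defaults to 128 when neither edge is available, otherwise it sums the above row and/or left column and rounds by a shift of 3 plus the number of available edges. A scalar sketch of that rounding (helper name and signature are illustrative, not part of the patch):

    /* Scalar form of the DC average computed by case_dc_pred_s above. */
    static int dc_expected_value(const unsigned char *above, int up_available,
                                 const unsigned char *left, int left_stride,
                                 int left_available) {
        int sum = 0, i;
        int shift = up_available + left_available;
        if (!shift)
            return 128;                       /* no neighbours: mid-grey */
        if (up_available)
            for (i = 0; i < 16; ++i) sum += above[i];
        if (left_available)
            for (i = 0; i < 16; ++i) sum += left[i * left_stride];
        shift += 3;                           /* 16 or 32 samples -> >>4 or >>5 */
        return (sum + (1 << (shift - 1))) >> shift;  /* round to nearest */
    }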
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 3a39210..0000000
--- a/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,81 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq,
-;                            unsigned char *dst, int stride);
-; r0   *q
-; r1   dq
-; r2   *dst
-; r3   stride
-|idct_dequant_0_2x_neon| PROC
-    push            {r4, r5}
-    vpush           {d8-d15}
-
-    add             r12, r2, #4
-    vld1.32         {d2[0]}, [r2], r3
-    vld1.32         {d8[0]}, [r12], r3
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d4[1]}, [r2], r3
-    vld1.32         {d10[1]}, [r12], r3
-
-    ldrh            r12, [r0]               ; lo q
-    ldrh            r4, [r0, #32]           ; hi q
-    mov             r5, #0
-    strh            r5, [r0]
-    strh            r5, [r0, #32]
-
-    sxth            r12, r12                ; lo
-    mul             r0, r12, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q0, r0
-    sxth            r4, r4                  ; hi
-    mul             r0, r4, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r0, r2, #4
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d10[1]}, [r0]
-
-    vpop            {d8-d15}
-    pop             {r4, r5}
-    bx              lr
-
-    ENDP            ; |idct_dequant_0_2x_neon|
-    END
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000..e6f862f
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int i, a0, a1;
+    int16x8x2_t q2Add;
+    int32x2_t d2s32 = vdup_n_s32(0),
+              d4s32 = vdup_n_s32(0);
+    uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    a0 = ((q[0] * dq) + 4) >> 3;
+    a1 = ((q[16] * dq) + 4) >> 3;
+    q[0] = q[16] = 0;
+    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+    q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+    for (i = 0; i < 2; i++, dst += 4) {
+        dst0 = dst;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d2s32));
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d4s32));
+
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+        d2s32 = vreinterpret_s32_u8(d2u8);
+        d4s32 = vreinterpret_s32_u8(d4u8);
+
+        dst0 = dst;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+    }
+    return;
+}
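The DC-only kernel above handles the case where just the DC coefficient of each 4x4 block survives quantization, so the whole inverse transform reduces to adding one rounded value to the 4x4 predictor. A scalar sketch for a single block (function name is illustrative):

    static void idct_dequant_0_scalar(int16_t *q, int16_t dq,
                                      unsigned char *dst, int stride) {
        int a = ((q[0] * dq) + 4) >> 3;   /* dequantize DC and round, as above */
        int r, c;
        q[0] = 0;                         /* the kernel clears the coefficient too */
        for (r = 0; r < 4; ++r) {
            for (c = 0; c < 4; ++c) {
                int v = dst[c] + a;
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* vqmovun */
            }
            dst += stride;
        }
    }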
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
deleted file mode 100644
index 8da0fa0..0000000
--- a/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq,
-;                               unsigned char *dst, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *dst
-; r3    stride
-|idct_dequant_full_2x_neon| PROC
-    vpush           {d8-d15}
-
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2],  r3     ; l pre
-    vld1.32         {d28[1]}, [r12], r3     ; r pre
-    vld1.32         {d29[0]}, [r2],  r3
-    vld1.32         {d29[1]}, [r12], r3
-    vld1.32         {d30[0]}, [r2],  r3
-    vld1.32         {d30[1]}, [r12], r3
-    vld1.32         {d31[0]}, [r2],  r3
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    vld1.16         {d0}, [r1]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can pre-shift the constant without losing precision. this avoids
-    ; having to shift again afterward, and also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r1, r2, #4              ; hi
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    vst1.32         {d0[0]}, [r2], r3       ; lo
-    vst1.32         {d0[1]}, [r1], r3       ; hi
-    vst1.32         {d1[0]}, [r2], r3
-    vst1.32         {d1[1]}, [r1], r3
-    vst1.32         {d2[0]}, [r2], r3
-    vst1.32         {d2[1]}, [r1], r3
-    vst1.32         {d3[0]}, [r2]
-    vst1.32         {d3[1]}, [r1]
-
-    vpop            {d8-d15}
-    bx             lr
-
-    ENDP           ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
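The comment block in the deleted file explains the VQDMULH constant trick; in scalar terms it works out as below (a rough sketch that ignores VQDMULH's saturation; the constants come from the pool above and match the 17734/20091 used in the replacement C file):

    /* vqdmulh.s16 computes (a * b * 2) >> 16 per lane (saturation ignored here). */
    static int16_t vqdmulh_scalar(int16_t a, int16_t b) {
        return (int16_t)(((int32_t)a * b * 2) >> 16);
    }

    /* sinpi8sqrt2 = 35468 (0x8a8c): low bit clear, so it is stored pre-shifted
     * as 17734 (0x4546) and used directly -- no correction needed. */
    static int16_t mul_sinpi(int16_t a) { return vqdmulh_scalar(a, 17734); }

    /* cospi8sqrt2minus1 = 20091 (0x4e7b): low bit set, so the doubled product
     * is halved afterwards (the vshr.s16 #1 instructions above). */
    static int16_t mul_cospi(int16_t a) { return (int16_t)(vqdmulh_scalar(a, 20091) >> 1); }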
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000..a60ed46
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+void idct_dequant_full_2x_neon(
+        int16_t *q,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 = vrshrq_n_s16(q6, 3);
+    q7 = vrshrq_n_s16(q7, 3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+                                          vreinterpret_u8_s32(d28)));
+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+                                          vreinterpret_u8_s32(d29)));
+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+                                          vreinterpret_u8_s32(d30)));
+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+                                          vreinterpret_u8_s32(d31)));
+
+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+    dst0 = dst;
+    dst1 = dst + 4;
+    vst1_lane_s32((int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    vst1_lane_s32((int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d31, 0);
+    vst1_lane_s32((int32_t *)dst1, d31, 1);
+    return;
+}
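For comparison with the intrinsics above, one 1-D pass of the inverse transform in scalar form, following the a1/b1/c1/d1 naming from the deleted assembly (written from those comments rather than copied from libvpx, so treat it as an illustrative sketch; the second pass additionally applies the +4 >> 3 rounding that the kernel does with vrshrq_n_s16):

    /* One column of the 4x4 inverse transform; the column's coefficients sit
     * at in[0], in[4], in[8], in[12] (row stride of 4 int16_t). */
    static void idct_col_sketch(const int16_t *in, int *out) {
        int a1 = in[0] + in[8];
        int b1 = in[0] - in[8];
        int t1 = ((int32_t)in[4] * 35468) >> 16;               /* 4 * sinpi       */
        int t2 = in[12] + (((int32_t)in[12] * 20091) >> 16);   /* 12 + 12 * cospi */
        int c1 = t1 - t2;
        int d1;
        t1 = in[4] + (((int32_t)in[4] * 20091) >> 16);         /* 4 + 4 * cospi   */
        t2 = ((int32_t)in[12] * 35468) >> 16;                  /* 12 * sinpi      */
        d1 = t1 + t2;
        out[0]  = a1 + d1;
        out[12] = a1 - d1;
        out[4]  = b1 + c1;
        out[8]  = b1 - c1;
    }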
diff --git a/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
deleted file mode 100644
index c4f09c7..0000000
--- a/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,409 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_loop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_loop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #68]              ; load thresh
-    add         r12, r2, r1
-    add         r1, r1, r1
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vld1.u8     {q3}, [r2@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r2@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r2@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r2@128]                  ; q2
-    vld1.u8     {q10}, [r12@128]                ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r12, r12, r1, lsl #1
-
-    bl          vp8_loop_filter_neon
-
-    vst1.u8     {q5}, [r2@128], r1              ; store op1
-    vst1.u8     {q6}, [r12@128], r1             ; store op0
-    vst1.u8     {q7}, [r2@128], r1              ; store oq0
-    vst1.u8     {q8}, [r12@128], r1             ; store oq1
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                      ; duplicate blimit
-    vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #68]              ; load thresh
-    ldr         r2, [sp, #72]               ; load v ptr
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r3@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1             ; p3
-    vld1.u8     {d8}, [r3@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1             ; p2
-    vld1.u8     {d10}, [r3@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1            ; p1
-    vld1.u8     {d12}, [r3@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1            ; p0
-    vld1.u8     {d14}, [r3@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1            ; q0
-    vld1.u8     {d16}, [r3@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1            ; q1
-    vld1.u8     {d18}, [r3@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1            ; q2
-    vld1.u8     {d20}, [r3@64]                 ; q3
-    vld1.u8     {d21}, [r12@64]                ; q3
-
-    bl          vp8_loop_filter_neon
-
-    sub         r0, r0, r1, lsl #1
-    sub         r2, r2, r1, lsl #1
-
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r2@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r2@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64]                 ; store u oq1
-    vst1.u8     {d17}, [r2@64]                 ; store v oq1
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                           unsigned char blimit,
-;                                           unsigned char limit,
-;                                           unsigned char thresh)
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, #4                 ; src ptr down by 4 columns
-    add         r1, r1, r1
-    ldr         r3, [sp, #68]              ; load thresh
-    add         r12, r2, r1, asr #1
-
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r12], r1
-
-    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d19}, [r2]
-    vld1.u8     {d21}, [r12]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp8_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-
-    sub         r0, r0, #2                 ; dst ptr
-
-    vswp        d14, d12
-    vswp        d16, d15
-
-    add         r12, r0, r1, asr #1
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                            unsigned char blimit,
-;                                            unsigned char limit,
-;                                            unsigned char thresh,
-;                                            unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                      ; duplicate blimit
-    sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #72]               ; load v ptr
-    vdup.u8     q1, r3                      ; duplicate limit
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r12], r1             ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r12]
-    vld1.u8     {d21}, [r3]
-
-    ldr        r12, [sp, #68]              ; load thresh
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp8_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-    vswp        d14, d12
-    vswp        d16, d15
-
-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-; void vp8_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0    flimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-|vp8_loop_filter_neon| PROC
-
-    ; vp8_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q4
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3
-
-    vmov.u8     q10, #0x80                   ; 0x80
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    vcge.u8     q15, q1, q15
-
-    ; vp8_filter() function
-    ; convert to signed
-    veor        q7, q7, q10                 ; qs0
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u8     q10, #3                     ; #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
-
-    vmovl.u8    q4, d20
-
-    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; vp8_filter &= hev
-    vand        q15, q15, q9                ; vp8_filter_mask
-
-    vaddw.s8    q2, q2, d2
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4                      ; #4
-
-    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; vp8_filter &= mask
-
-    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp8_filter+3)
-    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp8_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
-
-    ; outer tap adjustments: ++vp8_filter >> 1
-    vrshr.s8    q1, q1, #1
-    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-    vmov.u8     q0, #0x80                   ; 0x80
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    veor        q5, q13, q0                 ; *op1 = u^0x80
-    veor        q8, q12, q0                 ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp8_loop_filter_neon|
-
-;-----------------
-
-    END
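The shared vp8_loop_filter_neon helper above vectorizes the standard VP8 inner loop filter across 16 pixels. Its per-pixel arithmetic, written out in scalar form from the comments above (hev and mask are 0x00/0xff byte masks and the pixels are pre-biased by ^0x80, exactly as in the code; an illustrative sketch, not a drop-in replacement):

    static signed char sclamp(int v) {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    static void loop_filter_pixel_sketch(signed char *ps1, signed char *ps0,
                                         signed char *qs0, signed char *qs1,
                                         signed char hev, signed char mask) {
        signed char f, f1, f2;
        f  = (signed char)(sclamp(*ps1 - *qs1) & hev);       /* vp8_filter &= hev */
        f  = (signed char)(sclamp(f + 3 * (*qs0 - *ps0)) & mask);
        f2 = (signed char)(sclamp(f + 3) >> 3);               /* Filter2          */
        f1 = (signed char)(sclamp(f + 4) >> 3);               /* Filter1          */
        *ps0 = sclamp(*ps0 + f2);                             /* op0              */
        *qs0 = sclamp(*qs0 - f1);                             /* oq0              */
        f  = (signed char)(((f1 + 1) >> 1) & ~hev);           /* outer tap adjust */
        *ps1 = sclamp(*ps1 + f);                              /* op1              */
        *qs1 = sclamp(*qs1 - f);                              /* oq1              */
    }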
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
deleted file mode 100644
index 78d13c8..0000000
--- a/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_loop_filter_bvs_neon|
-    EXPORT |vp8_loop_filter_mbvs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp8_loop_filter_simple_vertical_edge_neon| PROC
-    vpush       {d8-d15}
-
-    sub         r0, r0, #2                  ; move src pointer down by 2 columns
-    add         r12, r1, r1
-    add         r3, r0, r1
-
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
-    vswp        d7, d10
-    vswp        d12, d9
-
-    ;vp8_filter_mask() function
-    ;vp8_hevmask() function
-    sub         r0, r0, r1, lsl #4
-    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
-    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q11, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
-    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
-    vsubl.s8    q13, d9, d11
-
-    vqsub.s8    q14, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q13, q13, q11
-
-    vmov.u8     q11, #0x03                  ; 0x03
-    vmov.u8     q12, #0x04                  ; 0x04
-
-    vaddw.s8    q2, q2, d28                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d29
-
-    vqmovn.s16  d28, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d29, q13
-
-    add         r0, r0, #1
-    add         r3, r0, r1
-
-    vand        q14, q14, q15                 ; vp8_filter &= mask
-
-    vqadd.s8    q2, q14, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q3, q14, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
-
-    ;calculate output
-    vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q4, q14                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    add         r12, r1, r1
-    vswp        d13, d14
-
-    ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0], r12
-    vst2.8      {d12[1], d13[1]}, [r3], r12
-    vst2.8      {d12[2], d13[2]}, [r0], r12
-    vst2.8      {d12[3], d13[3]}, [r3], r12
-    vst2.8      {d12[4], d13[4]}, [r0], r12
-    vst2.8      {d12[5], d13[5]}, [r3], r12
-    vst2.8      {d12[6], d13[6]}, [r0], r12
-    vst2.8      {d12[7], d13[7]}, [r3], r12
-    vst2.8      {d14[0], d15[0]}, [r0], r12
-    vst2.8      {d14[1], d15[1]}, [r3], r12
-    vst2.8      {d14[2], d15[2]}, [r0], r12
-    vst2.8      {d14[3], d15[3]}, [r3], r12
-    vst2.8      {d14[4], d15[4]}, [r0], r12
-    vst2.8      {d14[5], d15[5]}, [r3], r12
-    vst2.8      {d14[6], d15[6]}, [r0], r12
-    vst2.8      {d14[7], d15[7]}, [r3]
-
-    vpop        {d8-d15}
-    bx          lr
-    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_bvs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                   ; load blim from mem
-    mov         r4, r0
-    add         r0, r0, #4
-    vdup.s8     q1, r3                     ; duplicate blim
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    ; vp8_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
-    add         r0, r4, #8
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    add         r0, r4, #12
-    pop         {r4, lr}
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_bvs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_mbvs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_mbvs_neon|
-    END
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000..921bcad
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,283 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result) {
+    /*
+     * uint8x8x2_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    ---
+    * after vtrn_u8
+    00 10 02 12 | 04 14 06 16
+    01 11 03 13 | 05 15 07 17
+    */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  write_2x4(dst, pitch, result);
+  dst += pitch * 8;
+  write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
+
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50 51 52 53
+    20 21 22 23 | 60 61 62 63
+    30 31 32 33 | 70 71 72 73
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 40 41 60 61
+    02 03 22 23 | 42 43 62 63
+    10 11 30 31 | 50 51 70 71
+    12 13 32 33 | 52 53 72 73
+
+    00 01 20 21 | 40 41 60 61
+    10 11 30 31 | 50 51 70 71
+    02 03 22 23 | 42 43 62 63
+    12 13 32 33 | 52 53 72 73
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 40 50 60 70
+    01 11 21 31 | 41 51 61 71
+    02 12 22 32 | 42 52 62 72
+    03 13 23 33 | 43 53 63 73
+    */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d13
+    uint8x8x2_t d3u8x2;  // d14, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p);
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p);
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
+
+    q15u8 = vabdq_u8(q5u8, q4u8);
+    q14u8 = vabdq_u8(q3u8, q6u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q11s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q3u8 = veorq_u8(q3u8, q0u8);
+    q4u8 = veorq_u8(q4u8, q0u8);
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+                      vreinterpretq_s8_u8(q6u8));
+
+    q2s16 = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q11u8 = vdupq_n_u8(3);
+    q12u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+    d28s8 = vqmovn_s16(q2s16);
+    d29s8 = vqmovn_s16(q13s16);
+    q14s8 = vcombine_s8(d28s8, d29s8);
+
+    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q14s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
+    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
+    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
+    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
+
+    src1 = s - 1;
+    write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
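Two notes on the file above: the simple filter's activity test collapses to abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit, and the VPX_INCOMPATIBLE_GCC branches let affected toolchains avoid vld4_lane_u8/vst2_lane_u8 by rebuilding the same interleaved access from whole-register loads/stores plus vtrn. A scalar sketch of the mask test (illustrative only, ignoring the u8 saturation the NEON vqaddq_u8 provides for free):

    /* Per-pixel decision mirroring the vabdq/vqaddq/vshrq/vcgeq sequence above. */
    static int simple_filter_mask(unsigned char blimit, unsigned char p1,
                                  unsigned char p0, unsigned char q0,
                                  unsigned char q1) {
        int d0 = p0 > q0 ? p0 - q0 : q0 - p0;     /* abs(p0 - q0) */
        int d1 = p1 > q1 ? p1 - q1 : q1 - p1;     /* abs(p1 - q1) */
        return (d0 * 2 + (d1 >> 1)) <= blimit;    /* run the filter here? */
    }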
diff --git a/libvpx/vp8/common/arm/neon/reconintra_neon.c b/libvpx/vp8/common/arm/neon/reconintra_neon.c
new file mode 100644
index 0000000..af52cd5
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/reconintra_neon.c
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
+                                           unsigned char * yabove_row,
+                                           unsigned char * yleft,
+                                           int left_stride,
+                                           unsigned char * ypred_ptr,
+                                           int y_stride) {
+  const int mode = x->mode_info_context->mbmi.mode;
+  int i;
+
+  switch (mode) {
+    case DC_PRED:
+    {
+      int shift = x->up_available + x->left_available;
+      uint8x16_t v_expected_dc = vdupq_n_u8(128);
+
+      if (shift) {
+        unsigned int average = 0;
+        int expected_dc;
+        if (x->up_available) {
+          const uint8x16_t v_above = vld1q_u8(yabove_row);
+          const uint16x8_t a = vpaddlq_u8(v_above);
+          const uint32x4_t b = vpaddlq_u16(a);
+          const uint64x2_t c = vpaddlq_u32(b);
+          const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                        vreinterpret_u32_u64(vget_high_u64(c)));
+          average = vget_lane_u32(d, 0);
+        }
+        if (x->left_available) {
+          for (i = 0; i < 16; ++i) {
+              average += yleft[0];
+              yleft += left_stride;
+          }
+        }
+        shift += 3;
+        expected_dc = (average + (1 << (shift - 1))) >> shift;
+        v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
+      }
+      for (i = 0; i < 16; ++i) {
+        vst1q_u8(ypred_ptr, v_expected_dc);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case V_PRED:
+    {
+      const uint8x16_t v_above = vld1q_u8(yabove_row);
+      for (i = 0; i < 16; ++i) {
+        vst1q_u8(ypred_ptr, v_above);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case H_PRED:
+    {
+      for (i = 0; i < 16; ++i) {
+        const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
+        yleft += left_stride;
+        vst1q_u8(ypred_ptr, v_yleft);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case TM_PRED:
+    {
+      const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
+      const uint8x16_t v_above = vld1q_u8(yabove_row);
+      for (i = 0; i < 16; ++i) {
+        const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
+        const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
+        const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
+        const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
+                                         vreinterpretq_s16_u16(v_ytop_left));
+        const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
+                                         vreinterpretq_s16_u16(v_ytop_left));
+        const uint8x8_t pred_lo = vqmovun_s16(b_lo);
+        const uint8x8_t pred_hi = vqmovun_s16(b_hi);
+
+        vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
+        ypred_ptr += y_stride;
+        yleft += left_stride;
+      }
+    }
+    break;
+  }
+}
+
+void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
+                                            unsigned char * uabove_row,
+                                            unsigned char * vabove_row,
+                                            unsigned char * uleft,
+                                            unsigned char * vleft,
+                                            int left_stride,
+                                            unsigned char * upred_ptr,
+                                            unsigned char * vpred_ptr,
+                                            int pred_stride) {
+  const int mode = x->mode_info_context->mbmi.uv_mode;
+  int i;
+
+  switch (mode) {
+    case DC_PRED:
+    {
+      int shift = x->up_available + x->left_available;
+      uint8x8_t v_expected_udc = vdup_n_u8(128);
+      uint8x8_t v_expected_vdc = vdup_n_u8(128);
+
+      if (shift) {
+        unsigned int average_u = 0;
+        unsigned int average_v = 0;
+        int expected_udc;
+        int expected_vdc;
+        if (x->up_available) {
+          const uint8x8_t v_uabove = vld1_u8(uabove_row);
+          const uint8x8_t v_vabove = vld1_u8(vabove_row);
+          const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
+          const uint32x4_t b = vpaddlq_u16(a);
+          const uint64x2_t c = vpaddlq_u32(b);
+          average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
+          average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
+        }
+        if (x->left_available) {
+          for (i = 0; i < 8; ++i) {
+              average_u += uleft[0];
+              uleft += left_stride;
+              average_v += vleft[0];
+              vleft += left_stride;
+          }
+        }
+        shift += 2;
+        expected_udc = (average_u + (1 << (shift - 1))) >> shift;
+        expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
+        v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
+        v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
+      }
+      for (i = 0; i < 8; ++i) {
+        vst1_u8(upred_ptr, v_expected_udc);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_expected_vdc);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case V_PRED:
+    {
+      const uint8x8_t v_uabove = vld1_u8(uabove_row);
+      const uint8x8_t v_vabove = vld1_u8(vabove_row);
+      for (i = 0; i < 8; ++i) {
+        vst1_u8(upred_ptr, v_uabove);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_vabove);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case H_PRED:
+    {
+      for (i = 0; i < 8; ++i) {
+        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+        uleft += left_stride;
+        vleft += left_stride;
+        vst1_u8(upred_ptr, v_uleft);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_vleft);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case TM_PRED:
+    {
+      const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
+      const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
+      const uint8x8_t v_uabove = vld1_u8(uabove_row);
+      const uint8x8_t v_vabove = vld1_u8(vabove_row);
+      for (i = 0; i < 8; ++i) {
+        const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
+        const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
+        const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
+        const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
+        const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
+                                        vreinterpretq_s16_u16(v_utop_left));
+        const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
+                                        vreinterpretq_s16_u16(v_vtop_left));
+        const uint8x8_t pred_u = vqmovun_s16(b_u);
+        const uint8x8_t pred_v = vqmovun_s16(b_v);
+
+        vst1_u8(upred_ptr, pred_u);
+        vst1_u8(vpred_ptr, pred_v);
+        upred_ptr += pred_stride;
+        vpred_ptr += pred_stride;
+        uleft += left_stride;
+        vleft += left_stride;
+      }
+    }
+    break;
+  }
+}
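For reference while reviewing the new intrinsics above, here is a hedged scalar sketch of the DC_PRED rounding they implement (function and parameter names are illustrative and not part of the patch): average the available above/left edge pixels with rounding and fill the 16x16 block, falling back to 128 when neither edge is available.

#include <string.h>

/* Illustrative scalar equivalent of the DC_PRED path in
 * vp8_build_intra_predictors_mby_s_neon above (names are hypothetical). */
static void dc_pred_16x16_ref(const unsigned char *above,
                              const unsigned char *left, int left_stride,
                              int up_available, int left_available,
                              unsigned char *dst, int dst_stride) {
    int expected_dc = 128;                      /* neither edge available */
    int shift = up_available + left_available;  /* number of contributing edges */
    int i;

    if (shift) {
        unsigned int average = 0;
        if (up_available) {
            for (i = 0; i < 16; ++i) average += above[i];
        }
        if (left_available) {
            for (i = 0; i < 16; ++i) average += left[i * left_stride];
        }
        shift += 3;                             /* divide by 16 or 32, rounded */
        expected_dc = (int)((average + (1 << (shift - 1))) >> shift);
    }
    for (i = 0; i < 16; ++i)
        memset(dst + i * dst_stride, expected_dc, 16);
}

The chroma variant follows the same pattern with 8-pixel edges and shift += 2.
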
diff --git a/libvpx/vp8/common/arm/neon/sad_neon.c b/libvpx/vp8/common/arm/neon/sad_neon.c
deleted file mode 100644
index 6595ac0..0000000
--- a/libvpx/vp8/common/arm/neon/sad_neon.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_sad8x8_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 7; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad8x16_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 15; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad4x4_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x8_t d0, d8;
-    uint16x8_t q12;
-    uint32x2_t d1;
-    uint64x1_t d3;
-    int i;
-
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
-
-    for (i = 0; i < 3; i++) {
-        d0 = vld1_u8(src_ptr);
-        src_ptr += src_stride;
-        d8 = vld1_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, d0, d8);
-    }
-
-    d1 = vpaddl_u16(vget_low_u16(q12));
-    d3 = vpaddl_u32(d1);
-
-    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int vp8_sad16x16_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x16_t q0, q4;
-    uint16x8_t q12, q13;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-    for (i = 0; i < 15; i++) {
-        q0 = vld1q_u8(src_ptr);
-        src_ptr += src_stride;
-        q4 = vld1q_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-    }
-
-    q12 = vaddq_u16(q12, q13);
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad16x8_neon(
-        unsigned char *src_ptr,
-        int src_stride,
-        unsigned char *ref_ptr,
-        int ref_stride) {
-    uint8x16_t q0, q4;
-    uint16x8_t q12, q13;
-    uint32x4_t q1;
-    uint64x2_t q3;
-    uint32x2_t d5;
-    int i;
-
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-    for (i = 0; i < 7; i++) {
-        q0 = vld1q_u8(src_ptr);
-        src_ptr += src_stride;
-        q4 = vld1q_u8(ref_ptr);
-        ref_ptr += ref_stride;
-        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-    }
-
-    q12 = vaddq_u16(q12, q13);
-    q1 = vpaddlq_u16(q12);
-    q3 = vpaddlq_u32(q1);
-    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                  vreinterpret_u32_u64(vget_high_u64(q3)));
-
-    return vget_lane_u32(d5, 0);
-}
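The deleted file provided NEON SAD kernels for 4x4 through 16x16 blocks. As a reading aid only, a hedged scalar reference for the quantity each of them returned (names illustrative, not upstream API):

#include <stdlib.h>

/* Illustrative scalar reference: sum of absolute differences over a
 * width x height block, the value the deleted vp8_sad*_neon kernels return. */
static unsigned int sad_ref(const unsigned char *src, int src_stride,
                            const unsigned char *ref, int ref_stride,
                            int width, int height) {
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
            sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}
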
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index aad6133..4c2efc9 100644
--- a/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -9,10 +9,7 @@
  */
 
 #include <arm_neon.h>
-
-#ifdef _MSC_VER
-#define __builtin_prefetch(x)
-#endif
+#include "vpx_ports/mem.h"
 
 static const int8_t vp8_sub_pel_filters[8][8] = {
     {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positions are */
diff --git a/libvpx/vp8/common/arm/neon/variance_neon.c b/libvpx/vp8/common/arm/neon/variance_neon.c
deleted file mode 100644
index afd2dc3..0000000
--- a/libvpx/vp8/common/arm/neon/variance_neon.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-#define __builtin_prefetch(x)
-#endif
-
-unsigned int vp8_variance16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance16x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d2u8, d4u8, d6u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d2u8, d6u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d1u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d3u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d5u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d7u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d1u8, d5u8);
-        q13u16 = vsubl_u8(d2u8, d6u8);
-        q14u16 = vsubl_u8(d3u8, d7u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
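Again as a reading aid only, a hedged scalar reference for what the deleted variance kernels compute: *sse receives the raw sum of squared differences, and the return value subtracts the squared-sum term shifted by log2(width * height) — 8 for 16x16, 7 for 16x8/8x16, 6 for 8x8 — matching the vshr_n_u32 shifts in the code above. Names are illustrative.

#include <stdint.h>

/* Illustrative scalar reference for the deleted vp8_variance*_neon kernels. */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int width, int height, unsigned int *sse) {
    int64_t sum = 0;
    uint32_t sse_acc = 0;
    int r, c, shift = 0;

    while ((1 << shift) < width * height) ++shift;  /* log2(width * height) */
    for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c) {
            const int diff = src[c] - ref[c];
            sum += diff;
            sse_acc += (uint32_t)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sse_acc;
    return sse_acc - (unsigned int)((sum * sum) >> shift);
}
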
diff --git a/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
new file mode 100644
index 0000000..9d6807a
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -0,0 +1,550 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+static INLINE void vp8_loop_filter_neon(
+        uint8x16_t qblimit,  // flimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3    = vabdq_u8(q9, q8);
+    q4    = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3    = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q10 = vdupq_n_u8(3);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vmovl_u8(vget_low_u8(q10));
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    q0u8 = vdupq_n_u8(0x80);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+    src -= (pitch << 2);
+
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    src -= (pitch * 5);
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
+    d6  = vld1_u8(u);
+    u += pitch;
+    d7  = vld1_u8(v);
+    v += pitch;
+    d8  = vld1_u8(u);
+    u += pitch;
+    d9  = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    u -= (pitch * 5);
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+
+    v -= (pitch * 5);
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+                             const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+    /*
+     * uint8x8x4_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
+    vst4_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 7);
+#endif  // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s, *d;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s = src - 4;
+    d6  = vld1_u8(s);
+    s += pitch;
+    d8  = vld1_u8(s);
+    s += pitch;
+    d10 = vld1_u8(s);
+    s += pitch;
+    d12 = vld1_u8(s);
+    s += pitch;
+    d14 = vld1_u8(s);
+    s += pitch;
+    d16 = vld1_u8(s);
+    s += pitch;
+    d18 = vld1_u8(s);
+    s += pitch;
+    d20 = vld1_u8(s);
+    s += pitch;
+    d7  = vld1_u8(s);
+    s += pitch;
+    d9  = vld1_u8(s);
+    s += pitch;
+    d11 = vld1_u8(s);
+    s += pitch;
+    d13 = vld1_u8(s);
+    s += pitch;
+    d15 = vld1_u8(s);
+    s += pitch;
+    d17 = vld1_u8(s);
+    s += pitch;
+    d19 = vld1_u8(s);
+    s += pitch;
+    d21 = vld1_u8(s);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+
+    d = src - 2;
+    write_4x8(d, pitch, q4ResultL);
+    d += pitch * 8;
+    write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d20 = vld1_u8(us);
+
+    vs = v - 4;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    ud = u - 2;
+    write_4x8(ud, pitch, q4ResultL);
+
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+    vd = v - 2;
+    write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
deleted file mode 100644
index adc5b7e..0000000
--- a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;-----------------
-
-    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
-
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-|vp8_sub_pixel_variance16x16_neon_func| PROC
-    push            {r4-r6, lr}
-    vpush           {d8-d15}
-
-    adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #88]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for rest 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    sub             sp, sp, #256
-    mov             r3, sp
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5}, [r3]!
-    vst1.u8         {d6, d7}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_sp16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r3]!       ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r3]!
-    vst1.u8         {d18, d19}, [r3]!
-    vst1.u8         {d20, d21}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-    mov             r3, sp
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r3]!
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r12, #8
-
-sub_pixel_variance16x16_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q2}, [r4], r5
-    vld1.8          {q1}, [r3]!
-    vld1.8          {q3}, [r4], r5
-
-    vsubl.u8        q11, d0, d4                 ;diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             sub_pixel_variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    add             sp, sp, #528
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4-r6,pc}
-
-    ENDP
-
-    END
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
deleted file mode 100644
index b0829af..0000000
--- a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,583 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
-    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
-    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
-    EXPORT  |vp8_sub_pixel_variance16x16s_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_h_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_h_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #68]            ;load *sse from stack
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.8          {q11}, [r2], r3
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.8          {q12}, [r2], r3
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.8          {q13}, [r2], r3
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vext.8          q3, q2, q3, #1
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vld1.8          {q14}, [r2], r3
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-
-    vsubl.u8        q4, d0, d22                 ;diff
-    vsubl.u8        q5, d1, d23
-    vsubl.u8        q6, d2, d24
-    vsubl.u8        q7, d3, d25
-    vsubl.u8        q0, d4, d26
-    vsubl.u8        q1, d5, d27
-    vsubl.u8        q2, d6, d28
-    vsubl.u8        q3, d7, d29
-
-    vpadal.s16      q8, q4                     ;sum
-    vmlal.s16       q9, d8, d8                ;sse
-    vmlal.s16       q10, d9, d9
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_fpo16x16s_4_0_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_v_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_v_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    mov             r12, #4                     ;loop counter
-
-    vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #68]               ;load *sse from stack
-
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
-    vld1.u8         {q2}, [r0], r1
-    vld1.8          {q1}, [r2], r3
-    vld1.u8         {q4}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-    vld1.u8         {q6}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-    vld1.u8         {q15}, [r0], r1
-
-    vrhadd.u8       q0, q0, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q4
-    vrhadd.u8       q4, q4, q6
-    vrhadd.u8       q6, q6, q15
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                 ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-
-    vmov            q0, q15
-
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_spo16x16s_0_4_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_hv_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_hv_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-
-    ldr             lr, [sp, #68]           ;load *sse from stack
-    vmov.i8         q13, #0                      ;q8 - sum
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-
-    vmov.i8         q14, #0                      ;q9, q10 - sse
-    vmov.i8         q15, #0
-
-    mov             r12, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vld1.8          {q5}, [r2], r3
-    vrhadd.u8       q0, q0, q1
-    vld1.8          {q6}, [r2], r3
-    vrhadd.u8       q1, q1, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q3
-    vld1.8          {q8}, [r2], r3
-    vrhadd.u8       q3, q3, q4
-
-    vsubl.u8        q9, d0, d10                 ;diff
-    vsubl.u8        q10, d1, d11
-    vsubl.u8        q11, d2, d12
-    vsubl.u8        q12, d3, d13
-
-    vsubl.u8        q0, d4, d14                 ;diff
-    vsubl.u8        q1, d5, d15
-    vsubl.u8        q5, d6, d16
-    vsubl.u8        q6, d7, d17
-
-    vpadal.s16      q13, q9                     ;sum
-    vmlal.s16       q14, d18, d18                ;sse
-    vmlal.s16       q15, d19, d19
-
-    vpadal.s16      q13, q10                     ;sum
-    vmlal.s16       q14, d20, d20                ;sse
-    vmlal.s16       q15, d21, d21
-
-    vpadal.s16      q13, q11                     ;sum
-    vmlal.s16       q14, d22, d22                ;sse
-    vmlal.s16       q15, d23, d23
-
-    vpadal.s16      q13, q12                     ;sum
-    vmlal.s16       q14, d24, d24                ;sse
-    vmlal.s16       q15, d25, d25
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q13, q0                     ;sum
-    vmlal.s16       q14, d0, d0                ;sse
-    vmlal.s16       q15, d1, d1
-
-    vpadal.s16      q13, q1                     ;sum
-    vmlal.s16       q14, d2, d2                ;sse
-    vmlal.s16       q15, d3, d3
-
-    vpadal.s16      q13, q5                     ;sum
-    vmlal.s16       q14, d10, d10                ;sse
-    vmlal.s16       q15, d11, d11
-
-    vmov            q0, q4
-
-    vpadal.s16      q13, q6                     ;sum
-    vmlal.s16       q14, d12, d12                ;sse
-    vmlal.s16       q15, d13, d13
-
-    bne             vp8_filt16x16s_4_4_loop_neon
-
-    vadd.u32        q15, q14, q15                ;accumulate sse
-    vpaddl.s32      q0, q13                      ;accumulate sum
-
-    vpaddl.u32      q1, q15
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;==============================
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step() (called when 8<Speed<15), and in the first call of vp8_find_best_sub_pixel_step()
-;(called when Speed<=8), xoffset/yoffset can only be 4 or 0, which means we either bypass the filter
-;or the filter coefficients are {64, 64}. This simplified routine only works in that situation.
-;note: it can happen that both xoffset and yoffset are zero; that case is handled in C code later.
-
-|vp8_sub_pixel_variance16x16s_neon| PROC
-    push            {r4, lr}
-    vpush           {d8-d15}
-
-    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
-    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #80]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16s_only
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             firstpass_bfilter16x16s_only
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    mov             r3, sp
-    mov             r2, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vrhadd.u8       q0, q0, q1
-    vrhadd.u8       q1, q1, q2
-    vrhadd.u8       q2, q2, q3
-    vrhadd.u8       q3, q3, q4
-
-    subs            r2, r2, #1
-    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
-    vmov            q0, q4
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-
-    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
-    mov             r2, #2                  ;loop counter
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-    vext.8          q3, q2, q3, #1
-    vld1.u8         {d20, d21, d22, d23}, [r0], r1
-    vext.8          q5, q4, q5, #1
-    vld1.u8         {d24, d25, d26, d27}, [r0], r1
-    vext.8          q7, q6, q7, #1
-    vld1.u8         {d28, d29, d30, d31}, [r0], r1
-    vext.8          q9, q8, q9, #1
-    vext.8          q11, q10, q11, #1
-    vext.8          q13, q12, q13, #1
-    vext.8          q15, q14, q15, #1
-
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-    vrhadd.u8       q5, q10, q11
-    vrhadd.u8       q6, q12, q13
-    vrhadd.u8       q7, q14, q15
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-
-    mov             r2, #2                  ;loop counter
-    vld1.u8         {d0, d1}, [r0], r1      ;load src data
-    mov             r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
-    vld1.u8         {d2, d3}, [r0], r1
-    vld1.u8         {d4, d5}, [r0], r1
-    vld1.u8         {d6, d7}, [r0], r1
-    vld1.u8         {d8, d9}, [r0], r1
-
-    vrhadd.u8       q0, q0, q1
-    vld1.u8         {d10, d11}, [r0], r1
-    vrhadd.u8       q1, q1, q2
-    vld1.u8         {d12, d13}, [r0], r1
-    vrhadd.u8       q2, q2, q3
-    vld1.u8         {d14, d15}, [r0], r1
-    vrhadd.u8       q3, q3, q4
-    vld1.u8         {d16, d17}, [r0], r1
-    vrhadd.u8       q4, q4, q5
-    vrhadd.u8       q5, q5, q6
-    vrhadd.u8       q6, q6, q7
-    vrhadd.u8       q7, q7, q8
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vmov            q0, q8
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r2, #4
-
-sub_pixel_variance16x16s_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q1}, [r4], r12
-    vld1.8          {q2}, [r3]!
-    vld1.8          {q3}, [r4], r12
-    vld1.8          {q4}, [r3]!
-    vld1.8          {q5}, [r4], r12
-    vld1.8          {q6}, [r3]!
-    vld1.8          {q7}, [r4], r12
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r2, r2, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             sub_pixel_variance16x16s_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    add             sp, sp, #256
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4, pc}
-    ENDP
-
-    END
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
deleted file mode 100644
index 9d9f9e0..0000000
--- a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sub_pixel_variance8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
-
-|vp8_sub_pixel_variance8x8_neon| PROC
-    push            {r4-r5, lr}
-    vpush           {d8-d15}
-
-    adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #76]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #84]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[-1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[-1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    ;skip_secondpass_filter
-    beq             sub_pixel_variance8x8_neon
-
-    add             r3, r12, r3, lsl #3
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q2, #7
-    vqrshrn.u16    d24, q3, #7
-    vqrshrn.u16    d25, q4, #7
-    vqrshrn.u16    d26, q5, #7
-    vqrshrn.u16    d27, q6, #7
-    vqrshrn.u16    d28, q7, #7
-    vqrshrn.u16    d29, q8, #7
-
-    b               sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;----------------------
-;vp8_variance8x8_neon
-sub_pixel_variance8x8_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-sub_pixel_variance8x8_neon_loop
-    vld1.8          {d0}, [r4], r5              ;load dst data
-    subs            r12, r12, #1
-    vld1.8          {d1}, [r4], r5
-    vld1.8          {d2}, [r4], r5
-    vsubl.u8        q4, d22, d0                 ;calculate diff
-    vld1.8          {d3}, [r4], r5
-
-    vsubl.u8        q5, d23, d1
-    vsubl.u8        q6, d24, d2
-
-    vpadal.s16      q8, q4                      ;sum
-    vmlal.s16       q9, d8, d8                  ;sse
-    vmlal.s16       q10, d9, d9
-
-    vsubl.u8        q7, d25, d3
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-
-    vmov            q11, q13
-
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-
-    vmov            q12, q14
-
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    bne             sub_pixel_variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #6
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4-r5, pc}
-
-    ENDP
-
-;-----------------
-
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
diff --git a/libvpx/vp8/common/arm/reconintra_arm.c b/libvpx/vp8/common/arm/reconintra_arm.c
deleted file mode 100644
index e55a33c..0000000
--- a/libvpx/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_NEON_ASM
-extern void vp8_build_intra_predictors_mby_neon_func(
-    unsigned char *y_buffer,
-    unsigned char *ypred_ptr,
-    int y_stride,
-    int mode,
-    int Up,
-    int Left);
-
-void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
-{
-    unsigned char *y_buffer = x->dst.y_buffer;
-    unsigned char *ypred_ptr = x->predictor;
-    int y_stride = x->dst.y_stride;
-    int mode = x->mode_info_context->mbmi.mode;
-    int Up = x->up_available;
-    int Left = x->left_available;
-
-    vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-extern void vp8_build_intra_predictors_mby_s_neon_func(
-    unsigned char *y_buffer,
-    unsigned char *ypred_ptr,
-    int y_stride,
-    int mode,
-    int Up,
-    int Left);
-
-void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
-{
-    unsigned char *y_buffer = x->dst.y_buffer;
-    unsigned char *ypred_ptr = x->predictor;
-    int y_stride = x->dst.y_stride;
-    int mode = x->mode_info_context->mbmi.mode;
-    int Up = x->up_available;
-    int Left = x->left_available;
-
-    vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-#endif
diff --git a/libvpx/vp8/common/arm/variance_arm.c b/libvpx/vp8/common/arm/variance_arm.c
deleted file mode 100644
index e3f7083..0000000
--- a/libvpx/vp8/common/arm/variance_arm.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/variance.h"
-#include "vp8/common/filter.h"
-
-#if HAVE_MEDIA
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-unsigned int vp8_sub_pixel_variance8x8_armv6
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short first_pass[10*8];
-    unsigned char  second_pass[8*8];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            9, 8, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             8, 8, 8, VFilter);
-
-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
-                                   dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_armv6
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short first_pass[36*16];
-    unsigned char  second_pass[20*16];
-    const short *HFilter, *VFilter;
-    unsigned int var;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else
-    {
-        HFilter = vp8_bilinear_filters[xoffset];
-        VFilter = vp8_bilinear_filters[yoffset];
-
-        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                                src_pixels_per_line,
-                                                17, 16, HFilter);
-        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                                 16, 16, 16, VFilter);
-
-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                       dst_pixels_per_line, sse);
-    }
-    return var;
-}
-
-#endif /* HAVE_MEDIA */
-
-
-#if HAVE_NEON_ASM
-
-extern unsigned int vp8_sub_pixel_variance16x16_neon_func
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-
-unsigned int vp8_sub_pixel_variance16x16_neon
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-  if (xoffset == 4 && yoffset == 0)
-    return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 0 && yoffset == 4)
-    return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 4 && yoffset == 4)
-    return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else
-    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif
diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h
index ea1a6a4..192108a 100644
--- a/libvpx/vp8/common/blockd.h
+++ b/libvpx/vp8/common/blockd.h
@@ -187,8 +187,12 @@
 {
     FRAME_TYPE frame_type;
     int is_frame_dropped;
+    // The frame rate for the lowest resolution.
+    double low_res_framerate;
     /* The frame number of each reference frames */
     unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+    // The video frame counter value for the key frame, for lowest resolution.
+    unsigned int key_frame_counter_value;
     LOWER_RES_MB_INFO *mb_info;
 } LOWER_RES_FRAME_INFO;
 #endif
diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h
index 17262d6..ba3d9f5 100644
--- a/libvpx/vp8/common/common.h
+++ b/libvpx/vp8/common/common.h
@@ -29,19 +29,19 @@
 
 #define vp8_copy( Dest, Src) { \
         assert( sizeof( Dest) == sizeof( Src)); \
-        vpx_memcpy( Dest, Src, sizeof( Src)); \
+        memcpy( Dest, Src, sizeof( Src)); \
     }
 
 /* Use this for variably-sized arrays. */
 
 #define vp8_copy_array( Dest, Src, N) { \
         assert( sizeof( *Dest) == sizeof( *Src)); \
-        vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+        memcpy( Dest, Src, N * sizeof( *Src)); \
     }
 
-#define vp8_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest));
 
-#define vp8_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *Dest));
 
 
 #ifdef __cplusplus
diff --git a/libvpx/vp8/common/copy_c.c b/libvpx/vp8/common/copy_c.c
new file mode 100644
index 0000000..e339291
--- /dev/null
+++ b/libvpx/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer */
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
+                    unsigned char *dst_ptr, int dst_stride,
+                    int height)
+{
+    int r;
+
+    for (r = 0; r < height; r++)
+    {
+        memcpy(dst_ptr, src_ptr, 32);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+
+    }
+}
diff --git a/libvpx/vp8/common/debugmodes.c b/libvpx/vp8/common/debugmodes.c
index 46064e6..159fddc 100644
--- a/libvpx/vp8/common/debugmodes.c
+++ b/libvpx/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@
     fprintf(mvs, "\n");
 
     /* print out the block modes */
-    mb_index = 0;
     fprintf(mvs, "Mbs for Frame %d\n", frame);
     {
         int b_row;
@@ -129,7 +128,6 @@
 
 
     /* print out the block modes */
-    mb_index = 0;
     fprintf(mvs, "MVs for Frame %d\n", frame);
     {
         int b_row;
diff --git a/libvpx/vp8/common/dequantize.c b/libvpx/vp8/common/dequantize.c
index 6e2f69a..f8b04fa 100644
--- a/libvpx/vp8/common/dequantize.c
+++ b/libvpx/vp8/common/dequantize.c
@@ -38,6 +38,6 @@
 
     vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
 
-    vpx_memset(input, 0, 32);
+    memset(input, 0, 32);
 
 }
diff --git a/libvpx/vp8/common/entropy.c b/libvpx/vp8/common/entropy.c
index 8c046a4..c00e565 100644
--- a/libvpx/vp8/common/entropy.c
+++ b/libvpx/vp8/common/entropy.c
@@ -183,7 +183,6 @@
 
 void vp8_default_coef_probs(VP8_COMMON *pc)
 {
-    vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
-                   sizeof(default_coef_probs));
+    memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
 }
 
diff --git a/libvpx/vp8/common/entropymode.c b/libvpx/vp8/common/entropymode.c
index 091e4c7..8981a8d 100644
--- a/libvpx/vp8/common/entropymode.c
+++ b/libvpx/vp8/common/entropymode.c
@@ -159,13 +159,13 @@
 
 void vp8_init_mbmode_probs(VP8_COMMON *x)
 {
-    vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
-    vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
-    vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+    memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+    memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+    memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
 }
 
 void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
 {
-    vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+    memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
 }
 
diff --git a/libvpx/vp8/common/extend.c b/libvpx/vp8/common/extend.c
index c9bdd21..2d938ad 100644
--- a/libvpx/vp8/common/extend.c
+++ b/libvpx/vp8/common/extend.c
@@ -40,9 +40,9 @@
 
     for (i = 0; i < h; i++)
     {
-        vpx_memset(dest_ptr1, src_ptr1[0], el);
-        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
-        vpx_memset(dest_ptr2, src_ptr2[0], er);
+        memset(dest_ptr1, src_ptr1[0], el);
+        memcpy(dest_ptr1 + el, src_ptr1, w);
+        memset(dest_ptr2, src_ptr2[0], er);
         src_ptr1  += sp;
         src_ptr2  += sp;
         dest_ptr1 += dp;
@@ -60,13 +60,13 @@
 
     for (i = 0; i < et; i++)
     {
-        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+        memcpy(dest_ptr1, src_ptr1, linesize);
         dest_ptr1 += dp;
     }
 
     for (i = 0; i < eb; i++)
     {
-        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+        memcpy(dest_ptr2, src_ptr2, linesize);
         dest_ptr2 += dp;
     }
 }
diff --git a/libvpx/vp8/common/filter.c b/libvpx/vp8/common/filter.c
index 25266f8..84c608e 100644
--- a/libvpx/vp8/common/filter.c
+++ b/libvpx/vp8/common/filter.c
@@ -10,6 +10,7 @@
 
 
 #include "filter.h"
+#include "./vp8_rtcd.h"
 
 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
diff --git a/libvpx/vp8/common/generic/systemdependent.c b/libvpx/vp8/common/generic/systemdependent.c
index d84df33..28dc262 100644
--- a/libvpx/vp8/common/generic/systemdependent.c
+++ b/libvpx/vp8/common/generic/systemdependent.c
@@ -17,6 +17,7 @@
 #include "vpx_ports/x86.h"
 #endif
 #include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_MULTITHREAD
 #if HAVE_UNISTD_H && !defined(__OS2__)
@@ -44,6 +45,10 @@
 #endif
 #elif defined(_WIN32)
     {
+#if _WIN32_WINNT >= 0x0501
+        SYSTEM_INFO sysinfo;
+        GetNativeSystemInfo(&sysinfo);
+#else
         PGNSI pGNSI;
         SYSTEM_INFO sysinfo;
 
@@ -56,6 +61,7 @@
             pGNSI(&sysinfo);
         else
             GetSystemInfo(&sysinfo);
+#endif
 
         core_count = sysinfo.dwNumberOfProcessors;
     }
diff --git a/libvpx/vp8/common/idct_blk.c b/libvpx/vp8/common/idct_blk.c
index 65d5002..8aa7d9b 100644
--- a/libvpx/vp8/common/idct_blk.c
+++ b/libvpx/vp8/common/idct_blk.c
@@ -33,7 +33,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q   += 16;
@@ -59,7 +59,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q    += 16;
@@ -78,7 +78,7 @@
             else
             {
                 vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
-                vpx_memset(q, 0, 2 * sizeof(q[0]));
+                memset(q, 0, 2 * sizeof(q[0]));
             }
 
             q    += 16;
diff --git a/libvpx/vp8/common/idctllm.c b/libvpx/vp8/common/idctllm.c
index 47af52f..f5403c5 100644
--- a/libvpx/vp8/common/idctllm.c
+++ b/libvpx/vp8/common/idctllm.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp8_rtcd.h"
 
 /****************************************************************************
  * Notes:
diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c
index 0693326..2bfefb1 100644
--- a/libvpx/vp8/common/mfqe.c
+++ b/libvpx/vp8/common/mfqe.c
@@ -17,10 +17,11 @@
  * higher quality.
  */
 
-#include "postproc.h"
-#include "variance.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp8_rtcd.h"
 #include "vpx_scale/yv12config.h"
 
 #include <limits.h>
@@ -150,36 +151,36 @@
 
     if (blksize == 16)
     {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
 #ifdef USE_SSD
-        sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 128)>>8;
-        usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 32)>>6;
-        vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 32)>>6;
 #else
-        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
-        usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
-        vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+        usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+        vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
 #endif
     }
     else /* if (blksize == 8) */
     {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
 #ifdef USE_SSD
-        sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 32)>>6;
-        usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 8)>>4;
-        vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 8)>>4;
 #else
-        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
-        usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
-        vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+        usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+        vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
 #endif
     }
 
@@ -231,9 +232,9 @@
         {
             vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
             for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, uvblksize);
+                memcpy(udp, up, uvblksize);
             for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, uvblksize);
+                memcpy(vdp, vp, uvblksize);
         }
     }
 }
@@ -341,8 +342,8 @@
                                 for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
                                                         vp += show->uv_stride, vdp += dest->uv_stride)
                                 {
-                                    vpx_memcpy(udp, up, 4);
-                                    vpx_memcpy(vdp, vp, 4);
+                                    memcpy(udp, up, 4);
+                                    memcpy(vdp, vp, 4);
                                 }
                             }
                         }
diff --git a/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
index 619ee80..fc3bb8a 100644
--- a/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
+++ b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -26,7 +26,7 @@
 
     vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
 
-    vpx_memset(input, 0, 32);
+    memset(input, 0, 32);
 
 }
 
diff --git a/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c b/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
similarity index 100%
rename from libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
rename to libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
diff --git a/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c b/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
new file mode 100644
index 0000000..1054ed3
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
@@ -0,0 +1,911 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) =
+{
+    { 112, 16 },
+    { 96, 32 },
+    { 80, 48 },
+    { 64, 64 },
+    { 48, 80 },
+    { 32, 96 },
+    { 16, 112 }
+};
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
+{
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else if (8 == height)
+    {
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    if (16 == height)
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else
+    {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+    }
+}
+
+static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    loop_cnt = (height >> 2) - 1;
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+
+    for (; loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+    src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    src8 = LD_SB(src);
+    src += src_stride;
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else if (8 == height)
+    {
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
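+/* 8-wide vertical bilinear filter for heights that are multiples of 8; the
+ * last row of each 8-row batch (src8) is carried over as the first input row
+ * of the next iteration. */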
+static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;)
+    {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src8;
+    }
+}
+
+static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else
+    {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+}
+
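+/* 16-wide vertical bilinear filter: each iteration filters four rows, keeping
+ * the last loaded row (src4) as the starting row of the next iteration. */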
+static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        src0 = src4;
+    }
+}
+
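+/* Combined horizontal + vertical bilinear filter for 4-wide blocks: rows are
+ * first filtered horizontally into hz_out*, then pairs of those intermediate
+ * rows are filtered vertically with the second 2-tap filter. */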
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    if (4 == height)
+    {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+    else if (8 == height)
+    {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *RESTRICT src,
+                                          int32_t src_stride,
+                                          uint8_t *RESTRICT dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_horiz,
+                                          const int8_t *filter_vert,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    if (4 == height)
+    {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+    else
+    {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+}
+
+static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    /* load horizontal and vertical filter taps and replicate them */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
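+/* Bilinear sub-pel prediction entry points.  xoffset/yoffset select the
+ * 2-tap filters; a zero offset in either direction falls back to the
+ * corresponding single-direction filter or to a plain block copy. */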
+void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 4);
+        }
+        else
+        {
+            common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {
+            common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        }
+        else
+        {
+            uint32_t tp0, tp1, tp2, tp3;
+
+            LW4(src, src_stride, tp0, tp1, tp2, tp3);
+            SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 4);
+        }
+        else
+        {
+            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {
+            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        }
+        else
+        {
+            vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 8);
+        }
+        else
+        {
+            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {
+            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
+        }
+        else
+        {
+            vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                   int32_t xoffset, int32_t yoffset,
+                                   uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter, 16);
+        }
+        else
+        {
+            common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter,
+                                 16);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {
+            common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter,
+                                 16);
+        }
+        else
+        {
+            vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+        }
+    }
+}
diff --git a/libvpx/vp8/common/mips/msa/copymem_msa.c b/libvpx/vp8/common/mips/msa/copymem_msa.c
new file mode 100644
index 0000000..002a5ed
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/copymem_msa.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
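+/* Plain block copies: the 8-wide variants move rows as 64-bit loads/stores,
+ * the 16x16 variant uses full 128-bit MSA vectors. */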
+static void copy_8x4_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t src0, src1, src2, src3;
+
+    LD4(src, src_stride, src0, src1, src2, src3);
+    SD4(src0, src1, src2, src3, dst, dst_stride);
+}
+
+static void copy_8x8_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t src0, src1, src2, src3;
+
+    LD4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    SD4(src0, src1, src2, src3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD4(src, src_stride, src0, src1, src2, src3);
+    SD4(src0, src1, src2, src3, dst, dst_stride);
+}
+
+static void copy_16x16_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride, src8, src9, src10, src11, src12, src13, src14,
+           src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst,
+           dst_stride);
+}
+
+void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{
+    copy_16x16_msa(src, src_stride, dst, dst_stride);
+}
+
+void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    copy_8x8_msa(src, src_stride, dst, dst_stride);
+}
+
+void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    copy_8x4_msa(src, src_stride, dst, dst_stride);
+}
diff --git a/libvpx/vp8/common/mips/msa/idct_msa.c b/libvpx/vp8/common/mips/msa/idct_msa.c
new file mode 100644
index 0000000..e537a3f
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/idct_msa.c
@@ -0,0 +1,457 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
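+/* VP8 IDCT constants in Q16 fixed point:
+ * 20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536
+ * 35468 ~=  sqrt(2) * sin(pi / 8)      * 65536 */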
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                        \
+    v8i16 s4_m, s5_m, s6_m, s7_m;                                        \
+                                                                         \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);      \
+    ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                      \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);                \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);                \
+}
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)     \
+({                                                        \
+    v8i16 out_m;                                          \
+    v8i16 zero_m = { 0 };                                 \
+    v4i32 tmp1_m, tmp2_m;                                 \
+    v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);     \
+                                                          \
+    ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m);              \
+    tmp1_m >>= 16;                                        \
+    tmp2_m >>= 16;                                        \
+    tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16;             \
+    tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16;             \
+    out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m);  \
+                                                          \
+    out_m;                                                \
+})
+
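+/* One pass of the 4x4 inverse DCT on 16-bit lanes (two 4x4 blocks processed
+ * side by side); the VP8_IDCT_1D_W variant below does the same on 32-bit
+ * lanes. */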
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v8i16 a1_m, b1_m, c1_m, d1_m;                                  \
+    v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v8i16 const_cospi8sqrt2minus1_m;                               \
+                                                                   \
+    const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);     \
+    c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m);      \
+    c_tmp2_m = c_tmp2_m >> 1;                                      \
+    c_tmp2_m = in3 + c_tmp2_m;                                     \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m);      \
+    d_tmp1_m = d_tmp1_m >> 1;                                      \
+    d_tmp1_m = in1 + d_tmp1_m;                                     \
+    d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);     \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;               \
+                                                                   \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16);    \
+    d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
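+/* Full 4x4 inverse DCT (two 1-D passes with a rounding shift of 3), added to
+ * the predictor, clipped to [0, 255] and merged into the destination rows
+ * through the shuffle mask. */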
+static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
+                               int32_t pred_stride,
+                               uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride,
+                                 uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+
+    vec = __msa_fill_h(in_dc);
+    vec = __msa_srari_h(vec, 3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
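+/* Inverse Walsh-Hadamard transform of the 16 second-order (DC) coefficients;
+ * each result is written to the DC slot of the corresponding block, i.e.
+ * every 16th entry of mb_dq_coeff. */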
+void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
+    SRA_4V(vt0, vt1, vt2, vt3, 3);
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
+}
+
+static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
+                                       uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
+    v16i8 dest0, dest1, dest2, dest3;
+    v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
+    v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+    v2i64 zero = { 0 };
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+
+    LD_SH2(input, 8, input0, input1);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
+    PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
+    PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
+    PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
+    UNPCK_SH_SW(mul0, hz0_w, hz1_w);
+    UNPCK_SH_SW(mul1, hz2_w, hz3_w);
+    TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
+    VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
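+/* Dequantize and inverse-transform two horizontally adjacent 4x4 blocks at
+ * once; the trailing inline assembly zeroes the 32 consumed coefficients. */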
+static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
+                                          int16_t *dequant_input,
+                                          uint8_t *dest, int32_t dest_stride)
+{
+    v16u8 dest0, dest1, dest2, dest3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+    v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v8i16 res0, res1, res2, res3;
+    v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+    v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+    v16i8 zero = { 0 };
+
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
+         mul0, mul1, mul2, mul3);
+    PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
+    PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    UNPCK_SH_SW(hz0, hz0r, hz0l);
+    UNPCK_SH_SW(hz1, hz1r, hz1l);
+    UNPCK_SH_SW(hz2, hz2r, hz2l);
+    UNPCK_SH_SW(hz3, hz3r, hz3l);
+    VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+    SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
+    VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+    SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
+    PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
+                vt3);
+    TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+
+    __asm__ __volatile__(
+        "sw   $zero,    0(%[input])  \n\t"
+        "sw   $zero,    4(%[input])  \n\t"
+        "sw   $zero,    8(%[input])  \n\t"
+        "sw   $zero,   12(%[input])  \n\t"
+        "sw   $zero,   16(%[input])  \n\t"
+        "sw   $zero,   20(%[input])  \n\t"
+        "sw   $zero,   24(%[input])  \n\t"
+        "sw   $zero,   28(%[input])  \n\t"
+        "sw   $zero,   32(%[input])  \n\t"
+        "sw   $zero,   36(%[input])  \n\t"
+        "sw   $zero,   40(%[input])  \n\t"
+        "sw   $zero,   44(%[input])  \n\t"
+        "sw   $zero,   48(%[input])  \n\t"
+        "sw   $zero,   52(%[input])  \n\t"
+        "sw   $zero,   56(%[input])  \n\t"
+        "sw   $zero,   60(%[input])  \n\t"::
+
+        [input] "r"(input)
+    );
+}
+
+static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input_dc0, input_dc1, vec;
+    v16u8 dest0, dest1, dest2, dest3;
+    v16i8 zero = { 0 };
+    v8i16 res0, res1, res2, res3;
+
+    input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
+    input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
+    SRARI_H2_SH(input_dc0, input_dc1, 3);
+    vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
+    input[0] = 0;
+    input[16] = 0;
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
+               res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{
+    idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{
+    idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
+{
+    v8i16 dqc0, dqc1, q0, q1, dq0, dq1;
+
+    LD_SH2(DQC, 8, dqc0, dqc1);
+    LD_SH2(d->qcoeff, 8, q0, q1);
+    MUL2(dqc0, q0, dqc1, q1, dq0, dq1);
+    ST_SH2(dq0, dq1, d->dqcoeff, 8);
+}
+
+void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
+                              uint8_t *dest, int32_t stride)
+{
+    dequant_idct4x4_addblk_msa(input, dq, dest, stride);
+
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[input])     \n\t"
+        "sw     $zero,    4(%[input])     \n\t"
+        "sw     $zero,    8(%[input])     \n\t"
+        "sw     $zero,   12(%[input])     \n\t"
+        "sw     $zero,   16(%[input])     \n\t"
+        "sw     $zero,   20(%[input])     \n\t"
+        "sw     $zero,   24(%[input])     \n\t"
+        "sw     $zero,   28(%[input])     \n\t"
+
+        :
+        : [input] "r" (input)
+    );
+}
+
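+/* eobs holds one end-of-block count per 4x4 block.  Reading it as int16_t
+ * pairs two adjacent blocks: a value with any bit of 0xfefe set means at
+ * least one block of the pair has more than a DC coefficient, so the full
+ * IDCT path is taken; otherwise the DC-only path is used. */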
+void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
+                                      uint8_t *dst, int32_t stride,
+                                      char *eobs)
+{
+    int16_t *eobs_h = (int16_t *)eobs;
+    uint8_t i;
+
+    for (i = 4; i--;)
+    {
+        if (eobs_h[0])
+        {
+            if (eobs_h[0] & 0xfefe)
+            {
+                dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride);
+            }
+            else
+            {
+                dequant_idct_addconst_2x_msa(q, dq, dst, stride);
+            }
+        }
+
+        q += 32;
+
+        if (eobs_h[1])
+        {
+            if (eobs_h[1] & 0xfefe)
+            {
+                dequant_idct4x4_addblk_2x_msa(q, dq, dst + 8, stride);
+            }
+            else
+            {
+                dequant_idct_addconst_2x_msa(q, dq, dst + 8, stride);
+            }
+        }
+
+        q += 32;
+        dst += (4 * stride);
+        eobs_h += 2;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
+                                       uint8_t *dstu, uint8_t *dstv,
+                                       int32_t stride, char *eobs)
+{
+    int16_t *eobs_h = (int16_t *)eobs;
+
+    if (eobs_h[0])
+    {
+        if (eobs_h[0] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+        }
+    }
+
+    q += 32;
+    dstu += (stride * 4);
+
+    if (eobs_h[1])
+    {
+        if (eobs_h[1] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+        }
+    }
+
+    q += 32;
+
+    if (eobs_h[2])
+    {
+        if (eobs_h[2] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+        }
+    }
+
+    q += 32;
+    dstv += (stride * 4);
+
+    if (eobs_h[3])
+    {
+        if (eobs_h[3] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+        }
+    }
+}
diff --git a/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
new file mode 100644
index 0000000..a40f378
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
@@ -0,0 +1,826 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
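+/* Simple loop filter mask: 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit. */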
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)         \
+{                                                              \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
+                                                               \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
+    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
+    mask = ((v16u8)mask <= b_limit);                           \
+}
+
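+/* Normal 4-tap loop filter applied across an edge, updating p1, p0, q0, q1
+ * with the standard VP8 clamped adjustments gated by mask and hev. */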
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+                                                                        \
+    p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80);                        \
+    p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80);                        \
+    q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80);                        \
+    q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80);                        \
+                                                                        \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+                                                                        \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+                                                                        \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+                                                                        \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);    \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                      \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+                                                                        \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                 \
+    filt = filt & (v16i8)mask_in;                                       \
+                                                                        \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+                                                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80);                        \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80);                        \
+                                                                        \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80);                        \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80);                        \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)          \
+{                                                                  \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;       \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;        \
+                                                                   \
+    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                       \
+                                                                   \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+                                                                   \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                         \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+                                                                   \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                         \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+                                                                   \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)(mask);                                   \
+                                                                   \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                          \
+    filt1 >>= 3;                                                   \
+                                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                          \
+    filt2 >>= 3;                                                   \
+                                                                   \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                       \
+}
+
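+/* Macroblock-edge filter: in addition to the 4-tap adjustment it applies the
+ * 27/18/9 weighted corrections (rounded by 63, shifted by 7) to p2..q2. */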
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)            \
+{                                                                  \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                      \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                         \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;              \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;      \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                       \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+                                                                   \
+    p2_m = (v16i8)__msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8)__msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8)__msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8)__msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8)__msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8)__msa_xori_b(q2, 0x80);                          \
+                                                                   \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                 \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+                                                                   \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                 \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+                                                                   \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)mask;                                     \
+    filt2 = filt & (v16i8)hev;                                     \
+                                                                   \
+    hev = __msa_xori_b(hev, 0xff);                                 \
+    filt = filt & (v16i8)hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                         \
+    filt1 >>= 3;                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                         \
+    filt2 >>= 3;                                                   \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+                                                                   \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                  \
+                                                                   \
+    cnst27h = __msa_ldi_h(27);                                     \
+    cnst63h = __msa_ldi_h(63);                                     \
+                                                                   \
+    u_r = filt_r * cnst27h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+    u_l = filt_l * cnst27h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q0_m = __msa_subs_s_b(q0_m, u);                                \
+    q0 = __msa_xori_b((v16u8)q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                \
+    p0 = __msa_xori_b((v16u8)p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                     \
+    u_r = filt_r * cnst18h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l * cnst18h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q1_m = __msa_subs_s_b(q1_m, u);                                \
+    q1 = __msa_xori_b((v16u8)q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                \
+    p1 = __msa_xori_b((v16u8)p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                             \
+    u_r += filt_r + cnst63h;                                       \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l << 3;                                             \
+    u_l += filt_l + cnst63h;                                       \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q2_m = __msa_subs_s_b(q2_m, u);                                \
+    q2 = __msa_xori_b((v16u8)q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                \
+    p2 = __msa_xori_b((v16u8)p2_m, 0x80);                          \
+}
+
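+/* Computes the filter decisions for 16 columns: mask_out is 0xff where all
+   pixel differences stay within limit/b_limit (the column is filtered) and
+   hev_out is 0xff where |p1 - p0| or |q1 - q0| exceeds thresh. */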
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8)flat_out;                       \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8)mask_out;                       \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
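+/* Stores 6 bytes of one filtered row: a 32-bit word from element in0_idx of
+   in0 at pdst and a 16-bit halfword from element in1_idx of in1 at
+   pdst + stride (callers pass stride = 4, giving 6 contiguous bytes). */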
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+                                                                \
+    tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx);               \
+    tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx);               \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+
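+/* Filters a 16-pixel-wide horizontal block edge in one pass; the first 8
+   columns use the b_limit0/limit0/thresh0 set and the last 8 columns use
+   the b_limit1/limit1/thresh1 set, packed into the low/high halves of each
+   vector. */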
+static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t b_limit_in,
+                                                const uint8_t limit_in,
+                                                const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    temp_src = src - 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+
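+/* Chroma macroblock edge: u rows are packed into the low 8 bytes and v rows
+   into the high 8 bytes of each vector so both planes are filtered in one
+   pass; the results are split back out with 64-bit copies. */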
+static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                                 int32_t pitch,
+                                                 const uint8_t b_limit_in,
+                                                 const uint8_t limit_in,
+                                                 const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64)p2, 0);
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    q2_d = __msa_copy_u_d((v2i64)q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64)p2, 1);
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    q2_d = __msa_copy_u_d((v2i64)q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
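+/* Vertical macroblock edge: 16 rows of 8 pixels around the edge are loaded
+   and transposed so the horizontal filter code can be reused, then the six
+   modified pixels of each row are written back with VP8_ST6x1_UB. */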
+static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t b_limit_in,
+                                              const uint8_t limit_in,
+                                              const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
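+/* Simple-profile loop filters: only p0 and q0 on either side of the edge
+   are adjusted, gated by the single b_limit threshold. */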
+void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t *b_limit_ptr)
+{
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch);
+}
+
+void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    temp_src = src - 2;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+    src += 4 * pitch;
+}
+
+static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch);
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch);
+
+    /* right 8 elements of p3 are u pixels and
+       left 8 elements of p3 are v pixels */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+
+static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in)
+{
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
+    tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+
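+/* Macroblock-edge entry points: filter the y plane and, when chroma
+   pointers are provided, the u and v planes with the limits taken from
+   loop_filter_info. */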
+void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y,
+                                        *lpf_info_ptr->mblim,
+                                        *lpf_info_ptr->lim,
+                                        *lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        mbloop_filter_horizontal_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                             *lpf_info_ptr->mblim,
+                                             *lpf_info_ptr->lim,
+                                             *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    mbloop_filter_vertical_edge_y_msa(src_y, pitch_y,
+                                      *lpf_info_ptr->mblim,
+                                      *lpf_info_ptr->lim,
+                                      *lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                           *lpf_info_ptr->mblim,
+                                           *lpf_info_ptr->lim,
+                                           *lpf_info_ptr->hev_thr);
+    }
+}
+
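+/* Block-edge (inner) entry points: the three interior y edges at offsets
+   4, 8 and 12 and the single interior chroma edge at offset 4 are
+   filtered. */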
+void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    loop_filter_horizontal_4_dual_msa(src_y + 4 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    loop_filter_horizontal_4_dual_msa(src_y + 8 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    loop_filter_horizontal_4_dual_msa(src_y + 12 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        loop_filter_horizontal_edge_uv_msa(src_u + (4 * pitch_u_v),
+                                           src_v + (4 * pitch_u_v),
+                                           pitch_u_v,
+                                           *lpf_info_ptr->blim,
+                                           *lpf_info_ptr->lim,
+                                           *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    loop_filter_vertical_4_dual_msa(src_y + 4, pitch_y, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    loop_filter_vertical_4_dual_msa(src_y + 8, pitch_y,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    loop_filter_vertical_4_dual_msa(src_y + 12, pitch_y,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
+                                         *lpf_info_ptr->blim,
+                                         *lpf_info_ptr->lim,
+                                         *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (8 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (12 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+}
+
+void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 4, pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 8, pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 12, pitch_y, b_limit_ptr);
+}
diff --git a/libvpx/vp8/common/mips/msa/mfqe_msa.c b/libvpx/vp8/common/mips/msa/mfqe_msa.c
new file mode 100644
index 0000000..3e7629f
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/mfqe_msa.c
@@ -0,0 +1,146 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
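+/* MFQE weighted blend over an 8x8 block, four rows per loop iteration:
+   out = (src * src_weight + dst * dst_weight + rounding) >> MFQE_PRECISION
+   with dst_weight = (1 << MFQE_PRECISION) - src_weight. */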
+static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight)
+{
+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int32_t row;
+    uint64_t src0_d, src1_d, dst0_d, dst1_d;
+    v16i8 src0 = { 0 };
+    v16i8 src1 = { 0 };
+    v16i8 dst0 = { 0 };
+    v16i8 dst1 = { 0 };
+    v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+    src_wt = __msa_fill_h(src_weight);
+    dst_wt = __msa_fill_h(dst_weight);
+
+    for (row = 2; row--;)
+    {
+        LD2(src_ptr, src_stride, src0_d, src1_d);
+        src_ptr += (2 * src_stride);
+        LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+        INSERT_D2_SB(src0_d, src1_d, src0);
+        INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+        LD2(src_ptr, src_stride, src0_d, src1_d);
+        src_ptr += (2 * src_stride);
+        LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+        INSERT_D2_SB(src0_d, src1_d, src1);
+        INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+        UNPCK_UB_SH(src0, src_r, src_l);
+        UNPCK_UB_SH(dst0, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+        ST8x2_UB(dst0, dst_ptr, dst_stride);
+        dst_ptr += (2 * dst_stride);
+
+        UNPCK_UB_SH(src1, src_r, src_l);
+        UNPCK_UB_SH(dst1, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+        ST8x2_UB(dst1, dst_ptr, dst_stride);
+        dst_ptr += (2 * dst_stride);
+    }
+}
+
+static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
+                                      uint8_t *dst_ptr, int32_t dst_stride,
+                                      int32_t src_weight)
+{
+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int32_t row;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 src_wt, dst_wt;
+    v8i16 res_h_r, res_h_l;
+    v8i16 src_r, src_l, dst_r, dst_l;
+
+    src_wt = __msa_fill_h(src_weight);
+    dst_wt = __msa_fill_h(dst_weight);
+
+    for (row = 4; row--;)
+    {
+        LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+        UNPCK_UB_SH(src0, src_r, src_l);
+        UNPCK_UB_SH(dst0, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+
+        UNPCK_UB_SH(src1, src_r, src_l);
+        UNPCK_UB_SH(dst1, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+
+        UNPCK_UB_SH(src2, src_r, src_l);
+        UNPCK_UB_SH(dst2, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+
+        UNPCK_UB_SH(src3, src_r, src_l);
+        UNPCK_UB_SH(dst3, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
+                                   uint8_t *dst_ptr, int32_t dst_stride,
+                                   int32_t src_weight)
+{
+    filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,
+                              src_weight);
+}
+
+void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
+                                 uint8_t *dst_ptr, int32_t dst_stride,
+                                 int32_t src_weight)
+{
+    filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride,
+                            src_weight);
+}
diff --git a/libvpx/vp8/common/mips/msa/postproc_msa.c b/libvpx/vp8/common/mips/msa/postproc_msa.c
new file mode 100644
index 0000000..c88f302
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/postproc_msa.c
@@ -0,0 +1,851 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
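+/* Pseudo-random rounding offsets (values 0..14) for the macroblock
+   post-proc filter; a local copy of the vp8_rv table. */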
+static const int16_t vp8_rv_msa[] =
+{
+    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                out0, out1, out2, out3,                  \
+                                out4, out5, out6, out7,                  \
+                                out8, out9, out10, out11,                \
+                                out12, out13, out14, out15)              \
+{                                                                        \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                             \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                             \
+                                                                         \
+    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                             \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                             \
+    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out8, out10);                              \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out12, out14);                             \
+    out0 = (v16u8)temp6;                                                 \
+    out2 = (v16u8)temp7;                                                 \
+    out4 = (v16u8)temp8;                                                 \
+    out6 = (v16u8)temp9;                                                 \
+    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                \
+    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);             \
+    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);             \
+    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);             \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                \
+    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                \
+    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                \
+}
+
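+/* Averages src_in with the two rows above and the two rows below, but keeps
+   the averaged value only where all four absolute differences from src_in
+   stay below the per-column threshold in ref; elsewhere the original pixel
+   is retained. */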
+#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in,    \
+                           below1_in, below2_in, ref, out)  \
+{                                                           \
+    v16u8 temp0, temp1;                                     \
+                                                            \
+    temp1 = __msa_aver_u_b(above2_in, above1_in);           \
+    temp0 = __msa_aver_u_b(below2_in, below1_in);           \
+    temp1 = __msa_aver_u_b(temp1, temp0);                   \
+    out = __msa_aver_u_b(src_in, temp1);                    \
+    temp0 = __msa_asub_u_b(src_in, above2_in);              \
+    temp1 = __msa_asub_u_b(src_in, above1_in);              \
+    temp0 = (temp0 < ref);                                  \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    temp1 = __msa_asub_u_b(src_in, below1_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    temp1 = __msa_asub_u_b(src_in, below2_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    out = __msa_bmz_v(out, src_in, temp0);                  \
+}
+
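+/* Transposes sixteen 16-byte rows into twelve 16-byte column vectors
+   (reusing in0..in11) so the across pass can apply the same averaging to
+   the rows. */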
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                         in8, in9, in10, in11, in12, in13, in14, in15)  \
+{                                                                       \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                            \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                            \
+                                                                        \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                            \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                            \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                            \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                            \
+    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
+    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                            \
+    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                   \
+    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                            \
+    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                            \
+    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                            \
+    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                       \
+    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                   \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);              \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);              \
+    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                   \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);              \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);              \
+    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,            \
+               temp2, temp3, temp4, temp5);                             \
+    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
+               temp6, temp7, temp8, temp9);                             \
+    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);               \
+    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);              \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);              \
+    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);               \
+    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);             \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);             \
+}
+
+#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,    \
+                                in6, in7, in8, in9, in10, in11)  \
+{                                                                \
+    v8i16 temp0, temp1, temp2, temp3;                            \
+    v8i16 temp4, temp5, temp6, temp7;                            \
+                                                                 \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                     \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                     \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                     \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                     \
+    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                \
+    temp4 = __msa_ilvr_h(temp5, temp4);                          \
+    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                \
+    temp5 = __msa_ilvr_h(temp7, temp6);                          \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                     \
+    in0 = (v16u8)temp0;                                          \
+    in2 = (v16u8)temp1;                                          \
+    in4 = (v16u8)temp2;                                          \
+    in6 = (v16u8)temp3;                                          \
+    in8 = (v16u8)temp6;                                          \
+    in10 = (v16u8)temp7;                                         \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);       \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);       \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);       \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);       \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);       \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);      \
+}
+
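+/* Down + across post-proc filter for an 8-row chroma strip: the first pass
+   filters vertically in 16-column chunks, the second transposes the strip
+   so the same averaging runs horizontally, then transposes the result back
+   before storing. */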
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                            int32_t src_stride,
+                                            int32_t dst_stride,
+                                            int32_t cols, uint8_t *f)
+{
+    uint8_t *p_src = src_ptr;
+    uint8_t *p_dst = dst_ptr;
+    uint8_t *f_orig = f;
+    uint8_t *p_dst_st = dst_ptr;
+    uint16_t col;
+    uint64_t out0, out1, out2, out3;
+    v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+    v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+    v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+    for (col = (cols / 16); col--;)
+    {
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+               p_dst, dst_stride);
+
+        p_dst += 16;
+        p_src += 16;
+        f += 16;
+    }
+
+    if (0 != (cols / 16))
+    {
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        out0 = __msa_copy_u_d((v2i64)inter0, 0);
+        out1 = __msa_copy_u_d((v2i64)inter1, 0);
+        out2 = __msa_copy_u_d((v2i64)inter2, 0);
+        out3 = __msa_copy_u_d((v2i64)inter3, 0);
+        SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+        out0 = __msa_copy_u_d((v2i64)inter4, 0);
+        out1 = __msa_copy_u_d((v2i64)inter5, 0);
+        out2 = __msa_copy_u_d((v2i64)inter6, 0);
+        out3 = __msa_copy_u_d((v2i64)inter7, 0);
+        SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+    }
+
+    f = f_orig;
+    p_dst = dst_ptr - 2;
+    LD_UB8(p_dst, dst_stride,
+           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
+
+    for (col = 0; col < (cols / 8); ++col)
+    {
+        ref = LD_UB(f);
+        f += 8;
+        VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
+                                inter4, inter5, inter6, inter7,
+                                inter8, inter9, inter10, inter11);
+        if (0 == col)
+        {
+            above2 = inter2;
+            above1 = inter2;
+        }
+        else
+        {
+            above2 = inter0;
+            above1 = inter1;
+        }
+        src = inter2;
+        below1 = inter3;
+        below2 = inter4;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter2);
+        above2 = inter5;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter3);
+        above1 = inter6;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter4);
+        src = inter7;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
+                           ref_temp, inter5);
+        below1 = inter8;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
+                           ref_temp, inter6);
+        below2 = inter9;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter7);
+        if (col == (cols / 8 - 1))
+        {
+            above2 = inter9;
+        }
+        else
+        {
+            above2 = inter10;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter8);
+        if (col == (cols / 8 - 1))
+        {
+            above1 = inter9;
+        }
+        else
+        {
+            above1 = inter11;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter9);
+        TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+                           inter8, inter9, inter2, inter3, inter4, inter5,
+                           inter6, inter7, inter8, inter9);
+        p_dst += 8;
+        LD_UB2(p_dst, dst_stride, inter0, inter1);
+        ST8x1_UB(inter2, p_dst_st);
+        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+        p_dst_st += 8;
+    }
+}
+
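+/* Luma variant of the down + across filter: processes a 16-row strip but is
+   otherwise identical in structure to the chroma version above. */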
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                          int32_t src_stride,
+                                          int32_t dst_stride,
+                                          int32_t cols, uint8_t *f)
+{
+    uint8_t *p_src = src_ptr;
+    uint8_t *p_dst = dst_ptr;
+    uint8_t *p_dst_st = dst_ptr;
+    uint8_t *f_orig = f;
+    uint16_t col;
+    v16u8 above2, above1, below2, below1;
+    v16u8 src, ref, ref_temp;
+    v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+    v16u8 inter7, inter8, inter9, inter10, inter11;
+    v16u8 inter12, inter13, inter14, inter15;
+
+    for (col = (cols / 16); col--;)
+    {
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        src = LD_UB(p_src + 10 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+        below1 = LD_UB(p_src + 11 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+        below2 = LD_UB(p_src + 12 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+        above2 = LD_UB(p_src + 13 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+        above1 = LD_UB(p_src + 14 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+        src = LD_UB(p_src + 15 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+        below1 = LD_UB(p_src + 16 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+        below2 = LD_UB(p_src + 17 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+               p_dst, dst_stride);
+        ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
+               inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
+        p_src += 16;
+        p_dst += 16;
+        f += 16;
+    }
+
+    f = f_orig;
+    p_dst = dst_ptr - 2;
+    LD_UB8(p_dst, dst_stride,
+           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
+    LD_UB8(p_dst + 8 * dst_stride, dst_stride,
+           inter8, inter9, inter10, inter11, inter12, inter13,
+           inter14, inter15);
+
+    for (col = 0; col < cols / 8; ++col)
+    {
+        ref = LD_UB(f);
+        f += 8;
+        TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
+                         inter6, inter7, inter8, inter9, inter10, inter11,
+                         inter12, inter13, inter14, inter15);
+        if (0 == col)
+        {
+            above2 = inter2;
+            above1 = inter2;
+        }
+        else
+        {
+            above2 = inter0;
+            above1 = inter1;
+        }
+
+        src = inter2;
+        below1 = inter3;
+        below2 = inter4;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter2);
+        above2 = inter5;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter3);
+        above1 = inter6;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter4);
+        src = inter7;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
+                           ref_temp, inter5);
+        below1 = inter8;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
+                           ref_temp, inter6);
+        below2 = inter9;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter7);
+        if (col == (cols / 8 - 1))
+        {
+            above2 = inter9;
+        }
+        else
+        {
+            above2 = inter10;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter8);
+        if (col == (cols / 8 - 1))
+        {
+            above1 = inter9;
+        }
+        else
+        {
+            above1 = inter11;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter9);
+        VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
+                                inter6, inter7, inter8, inter9,
+                                inter2, inter3, inter4, inter5,
+                                inter6, inter7, inter8, inter9,
+                                inter10, inter11, inter12, inter13,
+                                inter14, inter15, above2, above1);
+
+        p_dst += 8;
+        LD_UB2(p_dst, dst_stride, inter0, inter1);
+        ST8x1_UB(inter2, p_dst_st);
+        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+        LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+        ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+        ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+        LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+        ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+        ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+        LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+        ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+        ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+        LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+        ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+        ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+        p_dst_st += 8;
+    }
+}
+
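+/* Entry point for the down/across deblock filter of one macroblock row:
+ * size 8 selects the chroma path, size 16 the luma path. */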
+void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t src_stride,
+                                              int32_t dst_stride,
+                                              int32_t cols, uint8_t *f,
+                                              int32_t size)
+{
+    if (8 == size)
+    {
+        postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
+                                        cols, f);
+    }
+    else if (16 == size)
+    {
+        postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
+                                      cols, f);
+    }
+}
+
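+/* In-place horizontal (across) macroblock post filter: a pixel is replaced
+ * by the rounded average of a sliding horizontal window only where the local
+ * variance estimate (sum_sq * 15 - sum * sum) stays below flimit; row edges
+ * are padded by replicating the first/last pixel. */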
+void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+                                   int32_t rows, int32_t cols, int32_t flimit)
+{
+    int32_t row, col, cnt;
+    uint8_t *src_dup = src_ptr;
+    v16u8 src0, src, tmp_orig;
+    v16u8 tmp = { 0 };
+    v16i8 zero = { 0 };
+    v8u16 sum_h, src_r_h, src_l_h;
+    v4u32 src_r_w, src_l_w;
+    v4i32 flimit_vec;
+
+    flimit_vec = __msa_fill_w(flimit);
+    for (row = rows; row--;)
+    {
+        int32_t sum_sq = 0;
+        int32_t sum = 0;
+        src0 = (v16u8)__msa_fill_b(src_dup[0]);
+        ST8x1_UB(src0, (src_dup - 8));
+
+        src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
+        ST_UB(src0, src_dup + cols);
+        src_dup[cols + 16] = src_dup[cols - 1];
+        tmp_orig = (v16u8)__msa_ldi_b(0);
+        tmp_orig[15] = tmp[15];
+        src = LD_UB(src_dup - 8);
+        src[15] = 0;
+        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+        src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+        sum_sq = HADD_SW_S32(src_r_w);
+        sum_sq += HADD_SW_S32(src_l_w);
+        sum_h = __msa_hadd_u_h(src, src);
+        sum = HADD_UH_U32(sum_h);
+        {
+            v16u8 src7, src8, src_r, src_l;
+            v16i8 mask;
+            v8u16 add_r, add_l;
+            v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+            v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+            v4i32 sub0, sub1, sub2, sub3;
+            v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+            v4i32 mul0, mul1, mul2, mul3;
+            v4i32 total0, total1, total2, total3;
+            v8i16 const8 = __msa_fill_h(8);
+
+            src7 = LD_UB(src_dup + 7);
+            src8 = LD_UB(src_dup - 8);
+            for (col = 0; col < (cols >> 4); ++col)
+            {
+                ILVRL_B2_UB(src7, src8, src_r, src_l);
+                HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
+                sum_r[0] = sum + sub_r[0];
+                for (cnt = 0; cnt < 7; ++cnt)
+                {
+                    sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+                }
+                sum_l[0] = sum_r[7] + sub_l[0];
+                for (cnt = 0; cnt < 7; ++cnt)
+                {
+                    sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+                }
+                sum = sum_l[7];
+                src = LD_UB(src_dup + 16 * col);
+                ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+                src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
+                src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
+                tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
+
+                HADD_UB2_UH(src_r, src_l, add_r, add_l);
+                UNPCK_SH_SW(sub_r, sub0, sub1);
+                UNPCK_SH_SW(sub_l, sub2, sub3);
+                ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+                ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+                MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
+                     mul0, mul1, mul2, mul3);
+                sum_sq0[0] = sum_sq + mul0[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+                }
+                sum_sq1[0] = sum_sq0[3] + mul1[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+                }
+                sum_sq2[0] = sum_sq1[3] + mul2[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+                }
+                sum_sq3[0] = sum_sq2[3] + mul3[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+                }
+                sum_sq = sum_sq3[3];
+
+                UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+                UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+                total0 = sum_sq0 * __msa_ldi_w(15);
+                total0 -= sum0_w * sum0_w;
+                total1 = sum_sq1 * __msa_ldi_w(15);
+                total1 -= sum1_w * sum1_w;
+                total2 = sum_sq2 * __msa_ldi_w(15);
+                total2 -= sum2_w * sum2_w;
+                total3 = sum_sq3 * __msa_ldi_w(15);
+                total3 -= sum3_w * sum3_w;
+                total0 = (total0 < flimit_vec);
+                total1 = (total1 < flimit_vec);
+                total2 = (total2 < flimit_vec);
+                total3 = (total3 < flimit_vec);
+                PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+                mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+                tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+
+                if (col == 0)
+                {
+                    uint64_t src_d;
+
+                    src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
+                    SD(src_d, (src_dup - 8));
+                }
+
+                src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+                src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+                ST_UB(tmp, (src_dup + (16 * col)));
+            }
+
+            src_dup += pitch;
+        }
+    }
+}
+
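+/* Vertical (down) counterpart of the filter above: columns are filtered in
+ * place with the same variance test, a dither value from vp8_rv_msa is added
+ * before the final shift, and the first/last rows are replicated to pad the
+ * column above and below. */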
+void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+                              int32_t cols, int32_t flimit)
+{
+    int32_t row, col, cnt, i;
+    const int16_t *rv3 = &vp8_rv_msa[63 & rand()];
+    v4i32 flimit_vec;
+    v16u8 dst7, dst8, dst_r_b, dst_l_b;
+    v16i8 mask;
+    v8u16 add_r, add_l;
+    v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+    v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+    flimit_vec = __msa_fill_w(flimit);
+
+    for (col = 0; col < (cols >> 4); ++col)
+    {
+        uint8_t *dst_tmp = &dst_ptr[col << 4];
+        v16u8 dst;
+        v16i8 zero = { 0 };
+        v16u8 tmp[16];
+        v8i16 mult0, mult1, rv2_0, rv2_1;
+        v8i16 sum0_h = { 0 };
+        v8i16 sum1_h = { 0 };
+        v4i32 mul0 = { 0 };
+        v4i32 mul1 = { 0 };
+        v4i32 mul2 = { 0 };
+        v4i32 mul3 = { 0 };
+        v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+        v4i32 add0, add1, add2, add3;
+        const int16_t *rv2[16];
+
+        dst = LD_UB(dst_tmp);
+        for (cnt = (col << 4), i = 0; i < 16; ++cnt)
+        {
+            rv2[i] = rv3 + ((cnt * 17) & 127);
+            ++i;
+        }
+        for (cnt = -8; cnt < 0; ++cnt)
+        {
+            ST_UB(dst, dst_tmp + cnt * pitch);
+        }
+
+        dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+        for (cnt = rows; cnt < rows + 17; ++cnt)
+        {
+            ST_UB(dst, dst_tmp + cnt * pitch);
+        }
+        for (cnt = -8; cnt <= 6; ++cnt)
+        {
+            dst = LD_UB(dst_tmp + (cnt * pitch));
+            UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+            MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+            mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
+            mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
+            mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
+            mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
+            ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+        }
+
+        for (row = 0; row < (rows + 8); ++row)
+        {
+            for (i = 0; i < 8; ++i)
+            {
+                rv2_0[i] = *(rv2[i] + (row & 127));
+                rv2_1[i] = *(rv2[i + 8] + (row & 127));
+            }
+            dst7 = LD_UB(dst_tmp + (7 * pitch));
+            dst8 = LD_UB(dst_tmp - (8 * pitch));
+            ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+            HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+            UNPCK_SH_SW(sub_r, sub0, sub1);
+            UNPCK_SH_SW(sub_l, sub2, sub3);
+            sum0_h += sub_r;
+            sum1_h += sub_l;
+
+            HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+            ILVRL_H2_SW(zero, add_r, add0, add1);
+            ILVRL_H2_SW(zero, add_l, add2, add3);
+            mul0 += add0 * sub0;
+            mul1 += add1 * sub1;
+            mul2 += add2 * sub2;
+            mul3 += add3 * sub3;
+            dst = LD_UB(dst_tmp);
+            ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+            dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+            dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+            tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
+
+            UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+            UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+            total0 = mul0 * __msa_ldi_w(15);
+            total0 -= sum0_w * sum0_w;
+            total1 = mul1 * __msa_ldi_w(15);
+            total1 -= sum1_w * sum1_w;
+            total2 = mul2 * __msa_ldi_w(15);
+            total2 -= sum2_w * sum2_w;
+            total3 = mul3 * __msa_ldi_w(15);
+            total3 -= sum3_w * sum3_w;
+            total0 = (total0 < flimit_vec);
+            total1 = (total1 < flimit_vec);
+            total2 = (total2 < flimit_vec);
+            total3 = (total3 < flimit_vec);
+            PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+            mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+            tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
+
+            if (row >= 8)
+            {
+                ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+            }
+
+            dst_tmp += pitch;
+        }
+    }
+}
+
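+/* Add a pseudo-random noise line (noise + rand() offset) to every row of the
+ * plane, two rows per iteration; pixels are first clamped away from the black
+ * and white extremes so adding the noise cannot wrap around. */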
+void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16],
+                             uint32_t width, uint32_t height,
+                             int32_t pitch)
+{
+    uint32_t i, j;
+
+    for (i = 0; i < height / 2; ++i)
+    {
+        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
+        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
+        for (j = width / 16; j--;)
+        {
+            v16i8 temp00_s, temp01_s;
+            v16u8 temp00, temp01, black_clamp, white_clamp;
+            v16u8 pos0, ref0, pos1, ref1;
+            v16i8 const127 = __msa_ldi_b(127);
+
+            pos0 = LD_UB(pos0_ptr);
+            ref0 = LD_UB(ref0_ptr);
+            pos1 = LD_UB(pos1_ptr);
+            ref1 = LD_UB(ref1_ptr);
+            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+            temp00 = (pos0 < black_clamp);
+            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+            temp01 = (pos1 < black_clamp);
+            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+            XORI_B2_128_UB(pos0, pos1);
+            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+            temp00 = (v16u8)(temp00_s < pos0);
+            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+            temp01 = (v16u8)(temp01_s < pos1);
+            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+            XORI_B2_128_UB(pos0, pos1);
+            pos0 += ref0;
+            ST_UB(pos0, pos0_ptr);
+            pos1 += ref1;
+            ST_UB(pos1, pos1_ptr);
+            pos0_ptr += 16;
+            pos1_ptr += 16;
+            ref0_ptr += 16;
+            ref1_ptr += 16;
+        }
+    }
+}
diff --git a/libvpx/vp8/common/mips/msa/reconintra_msa.c b/libvpx/vp8/common/mips/msa/reconintra_msa.c
new file mode 100644
index 0000000..57f705d
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/reconintra_msa.c
@@ -0,0 +1,342 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
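+/* V_PRED: copy the 8 pixels above the block into each of the 8 rows. */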
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    uint64_t out = LD(src);
+
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 out = LD_UB(src);
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
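+/* H_PRED: fill each row with its left-column pixel, replicated across the
+ * row by the 0x0101010101010101 multiply. */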
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+    out0 = src[0 * src_stride] * 0x0101010101010101ull;
+    out1 = src[1 * src_stride] * 0x0101010101010101ull;
+    out2 = src[2 * src_stride] * 0x0101010101010101ull;
+    out3 = src[3 * src_stride] * 0x0101010101010101ull;
+    out4 = src[4 * src_stride] * 0x0101010101010101ull;
+    out5 = src[5 * src_stride] * 0x0101010101010101ull;
+    out6 = src[6 * src_stride] * 0x0101010101010101ull;
+    out7 = src[7 * src_stride] * 0x0101010101010101ull;
+
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16u8 src0, src1, src2, src3;
+
+    for (row = 4; row--;)
+    {
+        inp0 = src[0];
+        src += src_stride;
+        inp1 = src[0];
+        src += src_stride;
+        inp2 = src[0];
+        src += src_stride;
+        inp3 = src[0];
+        src += src_stride;
+
+        src0 = (v16u8)__msa_fill_b(inp0);
+        src1 = (v16u8)__msa_fill_b(inp1);
+        src2 = (v16u8)__msa_fill_b(inp2);
+        src3 = (v16u8)__msa_fill_b(inp3);
+
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
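+/* DC_PRED: fill the block with the rounded average of the above row and/or
+ * the left column, or with 128 when neither neighbour is available. */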
+static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
+                                     int32_t src_stride_left,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row, addition = 0;
+    uint64_t out;
+    v16u8 src_above, store;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32)sum, 0);
+
+        for (row = 0; row < 8; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        store = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_left)
+    {
+        for (row = 0; row < 8; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 4) >> 3;
+        store = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64)__msa_srari_d((v2i64)sum, 3);
+        store = (v16u8)__msa_splati_b((v16i8)sum, 0);
+    }
+    else
+    {
+        store = (v16u8)__msa_ldi_b(128);
+    }
+
+    out = __msa_copy_u_d((v2i64)store, 0);
+
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
+                                       int32_t src_stride_left,
+                                       uint8_t *dst, int32_t dst_stride,
+                                       uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row;
+    uint32_t addition = 0;
+    v16u8 src_above, out;
+    v8u16 sum_above;
+    v4u32 sum_top;
+    v2u64 sum;
+
+    if (is_left && is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        addition = __msa_copy_u_w((v4i32)sum, 0);
+
+        for (row = 0; row < 16; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 16) >> 5;
+        out = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_left)
+    {
+        for (row = 0; row < 16; ++row)
+        {
+            addition += src_left[row * src_stride_left];
+        }
+
+        addition = (addition + 8) >> 4;
+        out = (v16u8)__msa_fill_b(addition);
+    }
+    else if (is_above)
+    {
+        src_above = LD_UB(src_top);
+
+        sum_above = __msa_hadd_u_h(src_above, src_above);
+        sum_top = __msa_hadd_u_w(sum_above, sum_above);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum_top = (v4u32)__msa_pckev_w((v4i32)sum, (v4i32)sum);
+        sum = __msa_hadd_u_d(sum_top, sum_top);
+        sum = (v2u64)__msa_srari_d((v2i64)sum, 4);
+        out = (v16u8)__msa_splati_b((v16i8)sum, 0);
+    }
+    else
+    {
+        out = (v16u8)__msa_ldi_b(128);
+    }
+
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+void vp8_build_intra_predictors_mby_s_msa(struct macroblockd *x,
+                                          unsigned char *yabove_row,
+                                          unsigned char *yleft,
+                                          int left_stride,
+                                          unsigned char *ypred_ptr,
+                                          int y_stride)
+{
+    uint32_t row, col;
+    uint8_t ytop_left = yabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_16x16_msa(yabove_row, yleft, left_stride,
+                                       ypred_ptr, y_stride,
+                                       x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_16x16_msa(yabove_row, ypred_ptr, y_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_16x16_msa(yleft, left_stride, ypred_ptr,
+                                          y_stride);
+            break;
+
+        case TM_PRED:
+            for (row = 0; row < 16; ++row)
+            {
+                for (col = 0; col < 16; ++col)
+                {
+                    int pred = yleft[row * left_stride] + yabove_row[col] -
+                               ytop_left;
+
+                    if (pred < 0)
+                        pred = 0;
+
+                    if (pred > 255)
+                        pred = 255;
+
+                    ypred_ptr[col] = pred;
+                }
+
+                ypred_ptr += y_stride;
+            }
+            break;
+
+        case B_PRED:
+        case NEARESTMV:
+        case NEARMV:
+        case ZEROMV:
+        case NEWMV:
+        case SPLITMV:
+        case MB_MODE_COUNT:
+            break;
+    }
+}
+
+void vp8_build_intra_predictors_mbuv_s_msa(struct macroblockd *x,
+                                           unsigned char *uabove_row,
+                                           unsigned char *vabove_row,
+                                           unsigned char *uleft,
+                                           unsigned char *vleft,
+                                           int left_stride,
+                                           unsigned char *upred_ptr,
+                                           unsigned char *vpred_ptr,
+                                           int pred_stride)
+{
+    uint32_t row, col;
+    uint8_t utop_left = uabove_row[-1];
+    uint8_t vtop_left = vabove_row[-1];
+
+    switch (x->mode_info_context->mbmi.uv_mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_8x8_msa(uabove_row, uleft, left_stride,
+                                     upred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            intra_predict_dc_8x8_msa(vabove_row, vleft, left_stride,
+                                     vpred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_8x8_msa(uabove_row, upred_ptr, pred_stride);
+            intra_predict_vert_8x8_msa(vabove_row, vpred_ptr, pred_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_8x8_msa(uleft, left_stride, upred_ptr,
+                                        pred_stride);
+            intra_predict_horiz_8x8_msa(vleft, left_stride, vpred_ptr,
+                                        pred_stride);
+            break;
+
+        case TM_PRED:
+            for (row = 0; row < 8; ++row)
+            {
+                for (col = 0; col < 8; ++col)
+                {
+                    int predu = uleft[row * left_stride] + uabove_row[col] -
+                                utop_left;
+                    int predv = vleft[row * left_stride] + vabove_row[col] -
+                                vtop_left;
+
+                    if (predu < 0)
+                        predu = 0;
+
+                    if (predu > 255)
+                        predu = 255;
+
+                    if (predv < 0)
+                        predv = 0;
+
+                    if (predv > 255)
+                        predv = 255;
+
+                    upred_ptr[col] = predu;
+                    vpred_ptr[col] = predv;
+                }
+
+                upred_ptr += pred_stride;
+                vpred_ptr += pred_stride;
+            }
+            break;
+
+        case B_PRED:
+        case NEARESTMV:
+        case NEARMV:
+        case ZEROMV:
+        case NEWMV:
+        case SPLITMV:
+        case MB_MODE_COUNT:
+            break;
+    }
+}
diff --git a/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
new file mode 100644
index 0000000..fb60fc1
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
@@ -0,0 +1,1850 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
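+/* 6-tap sub-pixel interpolation coefficients for the seven fractional
+ * 1/8-pel offsets, zero-padded to eight entries for the vector loads. */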
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) =
+{
+    { 0, -6, 123, 12, -1, 0, 0, 0 },
+    { 2, -11, 108, 36, -8, 1, 0, 0 },  /* New 1/4 pel 6 tap filter */
+    { 0, -9, 93, 50, -6, 0, 0, 0 },
+    { 3, -16, 77, 77, -16, 3, 0, 0 },  /* New 1/2 pel 6 tap filter */
+    { 0, -6, 50, 93, -9, 0, 0, 0 },
+    { 1, -8, 36, 108, -11, 2, 0, 0 },  /* New 1/4 pel 6 tap filter */
+    { 0, -1, 12, 123, -6, 0, 0, 0 },
+};
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
+{
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
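+/* Apply a 6-tap horizontal filter as three 2-tap dot-product accumulations,
+ * then round by VP8_FILTER_SHIFT and saturate to the signed 8-bit range. */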
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+({                                                                       \
+    v16i8 vec0_m, vec1_m, vec2_m;                                        \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+})
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, mask2,                \
+                                   filt0, filt1, filt2,                \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
+}
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2,                       \
+                                   filt0, filt1, filt2,                       \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)         \
+({                                                            \
+    v8i16 tmp0;                                               \
+                                                              \
+    tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);  \
+                                                              \
+    tmp0;                                                     \
+})
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+({                                                                     \
+    v16i8 vec0_m, vec1_m;                                              \
+    v8i16 hz_out_m;                                                    \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+                                                                       \
+    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);              \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
+                                                                       \
+    hz_out_m;                                                          \
+})
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+                                                                       \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
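+/* Naming convention: common_hz_* filter horizontally only, common_vt_*
+ * vertically only, and common_hv_* run the separable horizontal-then-vertical
+ * pass; 6t/4t gives the tap count and the trailing size the block width. */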
+static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= 2;
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= 2;
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else if (8 == height)
+    {
+        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= 2;
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= 2;
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
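+/* Vertical 6-tap filtering of a 4-wide column: five rows are preloaded and
+ * the interleaved row pairs are reused across loop iterations, so each
+ * iteration only loads four fresh rows. */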
+static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r,
+               src21_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (2 * src_stride);
+
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
+               src32_r, src43_r, src21_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
+               src32_l, src43_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
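+/* Separable 6-tap 2-D filter for 4-wide blocks: rows are first filtered
+ * horizontally into 16-bit intermediates (hz_out*), which are then filtered
+ * vertically to produce the final pixels. */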
+static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 tmp0, tmp1;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= (2 + 2 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src7, src8);
+        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= (2 + 2 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+    for (multiple8_cnt = 2; multiple8_cnt--;)
+    {
+        common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+        src += 8;
+        dst += 8;
+    }
+}
+
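+/* Naming used by the helpers in this file: hz/vt/hv selects horizontal-only,
+   vertical-only or combined filtering, 6t/4t is the number of taps applied
+   in each direction, and 4w/8w/16w (or 4x4/4x8, which also fixes the height)
+   is the block width in pixels. */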
+static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+    src -= 1;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+    src -= 1;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (4 == height)
+    {
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else if (8 == height)
+    {
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    src -= 1;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    src -= 1;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r);
+    src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r);
+        src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r);
+        src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride;
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= (1 + 1 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+    vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B2_128_SB(src3, src4);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out0, out1;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= (1 + 1 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+        vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        vec0 = vec4;
+        vec2 = vec1;
+    }
+}
+
+static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+    for (multiple8_cnt = 2; multiple8_cnt--;)
+    {
+        common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 res0, res1, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= (2 + 1 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+        XORI_B2_128_UB(res0, res1);
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+    v16u8 out0, out1;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    src -= (2 + src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+    for (multiple8_cnt = 2; multiple8_cnt--;)
+    {
+        common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v16u8 out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+
+    src -= (1 + 2 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+        out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 vec0, vec1;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    src -= (1 + 2 * src_stride);
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t multiple8_cnt;
+    for (multiple8_cnt = 2; multiple8_cnt--;)
+    {
+        common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                                 filter_vert, height);
+        src += 8;
+        dst += 8;
+    }
+}
+
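+/* Top-level 4x4 sixtap predictor. xoffset and yoffset (0..7) select rows of
+   vp8_subpel_filters_msa: even non-zero offsets use the full 6-tap kernels,
+   odd offsets take the 4-tap paths with the filter pointer advanced by one
+   (the outer taps of the odd-offset kernels are zero), and a zero offset
+   means no filtering in that direction, degenerating to a plain copy when
+   both offsets are zero. */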
+void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_4w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 4);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_4w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 4);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+            {
+                uint32_t tp0, tp1, tp2, tp3;
+
+                LW4(src, src_stride, tp0, tp1, tp2, tp3);
+                SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+                break;
+            }
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_4w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter, 4);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_4w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 4);
+                break;
+        }
+    }
+}
+
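+/* The 8x4, 8x8 and 16x16 predictors below apply the same offset-based
+   filter selection as the 4x4 case, dispatching to the 8-wide and 16-wide
+   helpers with the block height (4, 8 or 16) as the row count. */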
+void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 4);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 4);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter, 4);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 4);
+                break;
+        }
+    }
+}
+
+void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 8);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 8);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 8);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 8);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 8);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 8);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter,
+                                    8);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 8);
+                break;
+        }
+    }
+}
+
+void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset - 1];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset - 1];
+
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter,
+                                                      v_filter, 16);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter,
+                                                      v_filter + 1, 16);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter + 1,
+                                                      v_filter, 16);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter + 1,
+                                                      v_filter + 1, 16);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_16w_msa(src, src_stride, dst, dst_stride,
+                                         v_filter, 16);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_16w_msa(src, src_stride, dst, dst_stride,
+                                         v_filter + 1, 16);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_16w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, 16);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_16w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter + 1, 16);
+                break;
+        }
+    }
+}
diff --git a/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
new file mode 100644
index 0000000..27d5929
--- /dev/null
+++ b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -0,0 +1,1783 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
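+/* Scalar unaligned load/store helpers. MIPS release 6 is expected to handle
+   unaligned addresses with the ordinary lw/ld/sh/sw/sd instructions, so they
+   are used directly; older ISAs use the assembler's unaligned
+   pseudo-instructions (ulw/uld/ush/usw), with 64-bit accesses composed from
+   two 32-bit word operations where no native form is available. */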
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+                                                      \
+    asm volatile (                                    \
+        "lw  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+                                                             \
+    val0_m = LW(psrc_m);                                     \
+    val1_m = LW(psrc_m + 4);                                 \
+                                                             \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint16_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+
+#define SW(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint32_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+
+#define SD(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint64_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+#else  // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+                                                      \
+    asm volatile (                                    \
+        "ulw  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+                                                             \
+    val0_m = LW(psrc_m1);                                    \
+    val1_m = LW(psrc_m1 + 4);                                \
+                                                             \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+#define SH(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint16_t val_m = (val);          \
+                                           \
+    asm volatile (                         \
+        "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+
+#define SW(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint32_t val_m = (val);          \
+                                           \
+    asm volatile (                         \
+        "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+
+#define SD(val, pdst)                                         \
+{                                                             \
+    uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+    uint32_t val0_m, val1_m;                                  \
+                                                              \
+    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                              \
+    SW(val0_m, pdst_m1);                                      \
+    SW(val1_m, pdst_m1 + 4);                                  \
+}
+#endif  // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1, out2, out3
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3)  \
+{                                                  \
+    out0 = LW((psrc));                             \
+    out1 = LW((psrc) + stride);                    \
+    out2 = LW((psrc) + 2 * stride);                \
+    out3 = LW((psrc) + 3 * stride);                \
+}
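+
+/* For example, the unfiltered (xoffset == 0, yoffset == 0) path of
+   vp8_sixtap_predict4x4_msa copies the 4x4 block with one LW4/SW4 pair. */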
+
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1)  \
+{                                      \
+    out0 = LD((psrc));                 \
+    out1 = LD((psrc) + stride);        \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3)  \
+{                                                  \
+    LD2((psrc), stride, out0, out1);               \
+    LD2((psrc) + 2 * stride, stride, out2, out3);  \
+}
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SW(in0, (pdst));                           \
+    SW(in1, (pdst) + stride);                  \
+    SW(in2, (pdst) + 2 * stride);              \
+    SW(in3, (pdst) + 3 * stride);              \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SD(in0, (pdst));                           \
+    SD(in1, (pdst) + stride);                  \
+    SD(in2, (pdst) + 2 * stride);              \
+    SD(in3, (pdst) + 3 * stride);              \
+}
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_B(RTYPE, (psrc));                 \
+    out1 = LD_B(RTYPE, (psrc) + stride);        \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
+{                                                     \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
+    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
+}
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
+{                                                            \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);                \
+    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);   \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
+{                                                                 \
+    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
+    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
+}
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride,                                      \
+              out0, out1, out2, out3, out4, out5, out6, out7)           \
+{                                                                       \
+    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
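+
+/* Usage sketch (the names 'src' and 'src_stride' are illustrative only):
+       v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+       LD_UB8(src, src_stride, row0, row1, row2, row3,
+              row4, row5, row6, row7);
+   loads eight unaligned 16-byte rows of a block whose rows are
+   'src_stride' bytes apart.
+*/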
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_H(RTYPE, (psrc));                 \
+    out1 = LD_H(RTYPE, (psrc) + (stride));      \
+}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
+{                                                           \
+    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
+    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
+}
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1)  \
+{                                         \
+    out0 = LD_SW((psrc));                 \
+    out1 = LD_SW((psrc) + stride);        \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_B(RTYPE, in0, (pdst));                 \
+    ST_B(RTYPE, in1, (pdst) + stride);        \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
+{                                                         \
+    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
+              pdst, stride)                                         \
+{                                                                   \
+    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
+    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_H(RTYPE, in0, (pdst));                 \
+    ST_H(RTYPE, in1, (pdst) + stride);        \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride)  \
+{                                       \
+    ST_SW(in0, (pdst));                 \
+    ST_SW(in1, (pdst) + stride);        \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride)             \
+{                                                     \
+    uint16_t out0_m, out1_m, out2_m, out3_m;          \
+    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
+                                                      \
+    out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
+    out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
+    out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
+    out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
+                                                      \
+    SH(out0_m, pblk_2x4_m);                           \
+    SH(out1_m, pblk_2x4_m + stride);                  \
+    SH(out2_m, pblk_2x4_m + 2 * stride);              \
+    SH(out3_m, pblk_2x4_m + 3 * stride);              \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : 'idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
+{                                                                 \
+    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
+    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                      \
+                                                                  \
+    out0_m = __msa_copy_u_w((v4i32)in0, idx0);                    \
+    out1_m = __msa_copy_u_w((v4i32)in0, idx1);                    \
+    out2_m = __msa_copy_u_w((v4i32)in1, idx2);                    \
+    out3_m = __msa_copy_u_w((v4i32)in1, idx3);                    \
+                                                                  \
+    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
+}
+#define ST4x8_UB(in0, in1, pdst, stride)                            \
+{                                                                   \
+    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
+                                                                    \
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
+    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
+}
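+
+/* Usage sketch (names are illustrative only):
+       ST4x8_UB(res0, res1, dst, dst_stride);
+   writes word elements 0..3 of 'res0' to the first four rows and word
+   elements 0..3 of 'res1' to the next four rows, 4 bytes per row.
+*/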
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst)                  \
+{                                           \
+    uint64_t out0_m;                        \
+                                            \
+    out0_m = __msa_copy_u_d((v2i64)in, 0);  \
+    SD(out0_m, pdst);                       \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride)            \
+{                                             \
+    uint64_t out0_m, out1_m;                  \
+    uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                              \
+    out0_m = __msa_copy_u_d((v2i64)in, 0);    \
+    out1_m = __msa_copy_u_d((v2i64)in, 1);    \
+                                              \
+    SD(out0_m, pblk_8x2_m);                   \
+    SD(out1_m, pblk_8x2_m + stride);          \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride)                      \
+{                                                             \
+    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
+    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
+                                                              \
+    out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
+    out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
+    out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
+    out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
+                                                              \
+    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
+}
+
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)              \
+{                                                                      \
+    v16i8 zero_m = { 0 };                                              \
+                                                                       \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val);  \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val);  \
+}
+#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
+{                                                                          \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);     \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);     \
+}
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,        \
+                out0, out1, out2, slide_val)                            \
+{                                                                       \
+    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);  \
+    out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);  \
+}
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
+{                                                                      \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);  \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);  \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
+                out0, out1, out2)                                          \
+{                                                                          \
+    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
+    out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4);      \
+}
+#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
+
+/* Description : Shuffle halfword vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Halfword elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
+{                                                                      \
+    out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);  \
+}
+#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
+{                                                                \
+    out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);    \
+    out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);    \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
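+
+/* Per-lane sketch of the operation described above: for each halfword lane
+   i (0..7) of the output,
+       out0[i] =   mult0[2 * i] * cnst0[2 * i]
+                 + mult0[2 * i + 1] * cnst0[2 * i + 1]
+   with each product computed as unsigned byte * unsigned byte -> halfword.
+*/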
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
+                 cnst0, cnst1, cnst2, cnst3,                  \
+                 out0, out1, out2, out3)                      \
+{                                                             \
+    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
+{                                                                \
+    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);    \
+    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);    \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                     \
+    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
+{                                                                \
+    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);    \
+    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);    \
+}
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
+                 cnst0, cnst1, cnst2, cnst3,                  \
+                 out0, out1, out2, out3)                      \
+{                                                             \
+    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
+{                                                                \
+    out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);    \
+    out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);    \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
+{                                                                            \
+    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
+    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
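+
+/* Per-lane sketch: unlike DOTP_SB2, the dot product is accumulated into the
+   output, i.e. for each halfword lane i (0..7),
+       out0[i] +=   mult0[2 * i] * cnst0[2 * i]
+                  + mult0[2 * i + 1] * cnst0[2 * i + 1]
+   with signed byte * signed byte -> signed halfword products.
+*/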
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                      \
+    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \
+{                                                                            \
+    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
+    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                      \
+    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of it
+                 i.e. signed double word
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                           \
+{                                                                            \
+    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0);  \
+    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1);  \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in)                               \
+({                                                      \
+    v8i16 max_m = __msa_ldi_h(255);                     \
+    v8i16 out_m;                                        \
+                                                        \
+    out_m = __msa_maxi_s_h((v8i16)in, 0);               \
+    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+    out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1)  \
+{                                 \
+    in0 = CLIP_SH_0_255(in0);     \
+    in1 = CLIP_SH_0_255(in1);     \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
+{                                           \
+    CLIP_SH2_0_255(in0, in1);               \
+    CLIP_SH2_0_255(in2, in3);               \
+}
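+
+/* Example: clamping halfword results to the unsigned 8-bit pixel range.
+   A lane holding -20 becomes 0, 300 becomes 255 and 17 is left unchanged:
+       res = CLIP_SH_0_255(res);
+*/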
+
+/* Description : Clips all signed word elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed word
+*/
+#define CLIP_SW_0_255(in)                               \
+({                                                      \
+    v4i32 max_m = __msa_ldi_w(255);                     \
+    v4i32 out_m;                                        \
+                                                        \
+    out_m = __msa_maxi_s_w((v4i32)in, 0);               \
+    out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m);  \
+    out_m;                                              \
+})
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+   Arguments   : Input  - in       (signed word vector)
+                 Output - sum_m    (i32 sum)
+                 Return Type - signed word (GP)
+   Details     : 4 signed word elements of 'in' vector are added together and
+                 the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in)                             \
+({                                                  \
+    v2i64 res0_m, res1_m;                           \
+    int32_t sum_m;                                  \
+                                                    \
+    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
+    res1_m = __msa_splati_d(res0_m, 1);             \
+    res0_m = res0_m + res1_m;                       \
+    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);       \
+    sum_m;                                          \
+})
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+   Arguments   : Inputs  - in       (unsigned halfword vector)
+                 Outputs - sum_m    (u32 sum)
+                 Return Type - unsigned word
+   Details     : 8 unsigned halfword elements of input vector are added
+                 together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in)                                \
+({                                                     \
+    v4u32 res_m;                                       \
+    v2u64 res0_m, res1_m;                              \
+    uint32_t sum_m;                                    \
+                                                       \
+    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);      \
+    res0_m = __msa_hadd_u_d(res_m, res_m);             \
+    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
+    res0_m = res0_m + res1_m;                          \
+    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
+    sum_m;                                             \
+})
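+
+/* Example: for an input vector holding the halfwords { 1, 2, 3, 4, 5, 6, 7,
+   8 } the reduction above (halfwords -> words -> double words) returns the
+   scalar sum 36.
+*/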
+
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1)              \
+{                                                          \
+    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
+    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is subtracted from
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1)              \
+{                                                          \
+    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
+    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed odd halfword element from 'in0' is subtracted from
+                 even signed halfword element from 'in0' (pairwise) and the
+                 word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1)              \
+{                                                          \
+    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set element n of input vector to GPR value
+   Arguments   : Inputs - in0, in1
+                 Output - out
+                 Return Type - as per RTYPE
+   Details     : Set element 0 in vector 'out' to value specified in 'in0'
+*/
+#define INSERT_D2(RTYPE, in0, in1, out)               \
+{                                                     \
+    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
+    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
+    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
+}
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
+    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);  \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);  \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
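+
+/* Common usage sketch (assuming little-endian element order): interleaving
+   with a zero vector zero-extends bytes to halfwords, e.g.
+       v16i8 zero = { 0 };
+       ILVR_B2_SH(zero, src0, zero, src1, out0, out1);
+   places each of the lower 8 byte elements of 'src0'/'src1' in the low byte
+   of a halfword lane of 'out0'/'out1', with the high byte cleared.
+*/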
+
+/* Description : Interleave right half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)       \
+{                                                            \
+    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Maximum values between signed elements of vector and
+                 5-bit signed immediate value are copied to the output vector
+   Arguments   : Inputs  - in0, in1, max_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Maximum of signed halfword element values from 'in0' and
+                 'max_val' are written in place
+*/
+#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
+{                                                        \
+    in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val));  \
+    in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val));  \
+}
+#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
+{                                                     \
+    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
+    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
+}
+#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val)             \
+{                                                     \
+    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
+    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
+{                                                    \
+    SAT_SH2(RTYPE, in0, in1, sat_val);               \
+    SAT_SH2(RTYPE, in2, in3, sat_val);               \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, idx0, idx1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'idx0' element value from 'in' vector is replicated to all
+                  elements in 'out0' vector
+                  Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
+{                                                     \
+    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);    \
+    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);    \
+}
+#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,      \
+                  out0, out1, out2)                 \
+{                                                   \
+    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);   \
+    out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2);  \
+}
+#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
+#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
+
+/* Description : Indexed word element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, stidx
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'stidx' element value from 'in' vector is replicated to all
+                 elements in 'out0' vector
+                 'stidx + 1' element value from 'in' vector is replicated to all
+                 elements in 'out1' vector
+                 Valid index range for word operation is 0-3
+*/
+#define SPLATI_W2(RTYPE, in, stidx, out0, out1)          \
+{                                                        \
+    out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx);      \
+    out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx+1));  \
+}
+#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);  \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
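+
+/* Usage note: PCKEV_B* is commonly used to narrow halfword results back to
+   bytes by keeping the even (low-order, on a little-endian target) byte of
+   every halfword lane - even bytes of 'in1' fill the right half of 'out0'
+   and even bytes of 'in0' the left half, as described above. It is often
+   paired with the SAT_*/CLIP_* macros so the kept byte holds the full
+   saturated value.
+*/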
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);  \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double elements of 'in0' are copied to the left half of
+                 'out0' & even double elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+/* Description : Pack odd double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Odd double word elements of 'in0' are copied to the left half
+                 of 'out0' & odd double word elements of 'in1' are copied to
+                 the right half of 'out0'.
+*/
+#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
+#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+   Arguments   : Inputs  - in0, in1
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from input vector 'in0' is
+                 logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1)             \
+{                                                \
+    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
+    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
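+
+/* Usage note: XOR with 128 flips the sign bit of every byte, mapping the
+   unsigned range [0, 255] onto the signed range [-128, 127] (and back,
+   since the operation is its own inverse), e.g. so unsigned pixel data can
+   be processed with the signed-byte dot-product macros above.
+*/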
+
+#define XORI_B3_128(RTYPE, in0, in1, in2)        \
+{                                                \
+    XORI_B2_128(RTYPE, in0, in1);                \
+    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
+{                                               \
+    XORI_B2_128(RTYPE, in0, in1);               \
+    XORI_B2_128(RTYPE, in2, in3);               \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
+{                                                    \
+    XORI_B3_128(RTYPE, in0, in1, in2);               \
+    XORI_B2_128(RTYPE, in3, in4);                    \
+}
+#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
+
+#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
+{                                                                   \
+    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
+    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
+}
+#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is left shifted by 'shift' and
+                 the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift)  \
+{                                           \
+    in0 = in0 << shift;                     \
+    in1 = in1 << shift;                     \
+    in2 = in2 << shift;                     \
+    in3 = in3 << shift;                     \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift)  \
+{                                          \
+    in0 = in0 >> shift;                    \
+    in1 = in1 >> shift;                    \
+    in2 = in2 >> shift;                    \
+    in3 = in3 >> shift;                    \
+}
+
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift)                   \
+{                                                         \
+    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
+    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                  \
+    SRAR_W2(RTYPE, in0, in1, shift);               \
+    SRAR_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
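+
+/* Worked example: SRARI_H2_SH(in0, in1, 3) shifts with rounding, so a lane
+   holding 21 becomes (21 + 4) >> 3 = 3, whereas a plain arithmetic shift
+   right by 3 would give 2.
+*/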
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                   \
+    SRARI_H2(RTYPE, in0, in1, shift);               \
+    SRARI_H2(RTYPE, in2, in3, shift);               \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
+}
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                   \
+    SRARI_W2(RTYPE, in0, in1, shift);               \
+    SRARI_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
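
The rounding in the SRAR/SRARI family is the usual add-half-then-shift: the
last bit discarded by the arithmetic shift is added back into the result. A
minimal scalar sketch of one lane, assuming an arithmetic right shift of
negative values as on the MIPS targets this header serves (illustrative only,
not the vector implementation):

    #include <stdio.h>
    #include <stdint.h>

    /* Scalar model of one lane of __msa_srari_w / __msa_srar_w:
     * arithmetic shift right with the last discarded bit added back.
     * A shift of 0 must return the input unchanged. */
    static int32_t srari_lane(int32_t in, int shift)
    {
        if (shift == 0)
            return in;
        return (in + (1 << (shift - 1))) >> shift;
    }

    int main(void)
    {
        printf("%d\n", srari_lane(13, 2));   /* (13 + 2) >> 2  =  3 */
        printf("%d\n", srari_lane(-13, 2));  /* (-13 + 2) >> 2 = -3 */
        return 0;
    }
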
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element of 'in0' is multiplied by the corresponding
+                 element of 'in1' and the result is written to 'out0';
+                 likewise 'in2' * 'in3' is written to 'out1'.
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 * in1;                         \
+    out1 = in2 * in3;                         \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    MUL2(in0, in1, in2, in3, out0, out1);             \
+    MUL2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in0' is added to 'in1' and result is written
+                 to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 + in1;                         \
+    out1 = in2 + in3;                         \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    ADD2(in0, in1, in2, in3, out0, out1);             \
+    ADD2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in1' is subtracted from 'in0' and result is
+                 written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 - in1;                         \
+    out1 = in2 - in3;                         \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    out0 = in0 - in1;                                 \
+    out1 = in2 - in3;                                 \
+    out2 = in4 - in5;                                 \
+    out3 = in6 - in7;                                 \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with the same vector 'in' to
+                 generate 4 word elements, keeping the sign intact
+*/
+#define UNPCK_R_SH_SW(in, out)                     \
+{                                                  \
+    v8i16 sign_m;                                  \
+                                                   \
+    sign_m = __msa_clti_s_h((v8i16)in, 0);         \
+    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (unsigned halfword vectors)
+                 Return Type - signed halfword
+   Details     : Zero extended right half of vector is returned in 'out0'
+                 Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1)       \
+{                                         \
+    v16i8 zero_m = { 0 };                 \
+                                          \
+    ILVRL_B2_SH(zero_m, in, out0, out1);  \
+}
+
+/* Description : Sign extend halfword elements from input vector and return
+                 the result in pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 4 signed word elements in 'out0'.
+                 It is then interleaved left with the same vector 'in' to
+                 generate 4 signed word elements in 'out1'.
+*/
+#define UNPCK_SH_SW(in, out0, out1)        \
+{                                          \
+    v8i16 tmp_m;                           \
+                                           \
+    tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
+    ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
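
The sign extension in UNPCK_R_SH_SW and UNPCK_SH_SW works by building a mask
that is all ones for negative halfwords (the clti_s compare against 0) and
interleaving that mask above each halfword, so every resulting 32-bit word
carries the correct sign. A scalar sketch of one lane (illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    /* One lane of the unpack: pair a 16-bit value with its sign mask
     * (0x0000 or 0xFFFF) to form a sign-extended 32-bit word. */
    static int32_t unpack_lane(int16_t in)
    {
        uint16_t sign_mask = (in < 0) ? 0xFFFFu : 0x0000u;           /* clti_s_h  */
        uint32_t word = ((uint32_t)sign_mask << 16) | (uint16_t)in;  /* ilvr/ilvl */
        return (int32_t)word;
    }

    int main(void)
    {
        printf("%d %d\n", unpack_lane(1234), unpack_lane(-1234));  /* 1234 -1234 */
        return 0;
    }
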
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                \
+    out0 = in0 + in3;                                            \
+    out1 = in1 + in2;                                            \
+                                                                 \
+    out2 = in1 - in2;                                            \
+    out3 = in0 - in3;                                            \
+}
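
BUTTERFLY_4 is the standard add/subtract butterfly used by the transform
paths: the outer and inner input pairs are summed into the first two outputs
and differenced into the last two. A scalar example with plain integers in
place of vectors (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        int in0 = 1, in1 = 2, in2 = 3, in3 = 4;
        int out0 = in0 + in3;  /*  5 */
        int out1 = in1 + in2;  /*  5 */
        int out2 = in1 - in2;  /* -1 */
        int out3 = in0 - in3;  /* -3 */
        printf("%d %d %d %d\n", out0, out1, out2, out3);
        return 0;
    }
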
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
+                        out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                        \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
+    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
+                                                                         \
+    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
+               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
+    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
+    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
+    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
+    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
+    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
+    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3)                        \
+{                                                                          \
+    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+                                                                           \
+    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
+    out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
+    out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
+    out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+                                                                           \
+    tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);               \
+    tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m);           \
+    out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+}
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                            in8, in9, in10, in11, in12, in13, in14, in15,    \
+                            out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                             \
+    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
+    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
+    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
+    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
+                                                                             \
+    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                 \
+    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                 \
+    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                 \
+    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                 \
+    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                   \
+    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                 \
+    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                   \
+    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                 \
+                                                                             \
+    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
+    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                 \
+    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
+    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \
+    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 s0_m, s1_m;                                                   \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);               \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);               \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                     \
+    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                     \
+    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);             \
+    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);             \
+}
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
+                                                                        \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
+                                                                        \
+    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);               \
+    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);               \
+}
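
Every TRANSPOSE* macro above implements the same mapping, out[r][c] = in[c][r];
only the interleave sequence changes with the element size. A scalar 4x4
reference that the word variant can be checked against (illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int32_t in[4][4], out[4][4];
        int r, c;

        for (r = 0; r < 4; r++)
            for (c = 0; c < 4; c++)
                in[r][c] = r * 4 + c;

        /* Reference behaviour of TRANSPOSE4x4_SW_SW: out[r][c] = in[c][r]. */
        for (r = 0; r < 4; r++)
            for (c = 0; c < 4; c++)
                out[r][c] = in[c][r];

        for (r = 0; r < 4; r++)
            printf("%d %d %d %d\n", out[r][0], out[r][1], out[r][2], out[r][3]);
        return 0;
    }
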
+
+/* Description : Dot product and addition of 3 signed halfword input vectors
+   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
+                 Output - out0_m
+                 Return Type - signed halfword
+   Details     : Dot product of 'in0' with 'coeff0'
+                 Dot product of 'in1' with 'coeff1'
+                 Dot product of 'in2' with 'coeff2'
+                 Addition of all the 3 vector results
+                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
+*/
+#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
+({                                                                \
+    v8i16 tmp1_m;                                                 \
+    v8i16 out0_m;                                                 \
+                                                                  \
+    out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0);           \
+    out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1);  \
+    tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2);           \
+    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                      \
+                                                                  \
+    out0_m;                                                       \
+})
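
Each 16-bit lane produced by DPADD_SH3_SH is the sum of three byte-pair dot
products, i.e. a 6-tap filter evaluated as three 2-tap pieces; only the final
add saturates. A scalar model of one output lane, with the saturation omitted
(illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    /* One lane: dotp_s_h multiplies adjacent signed bytes by adjacent
     * signed coefficient bytes and sums each pair; dpadd_s_h accumulates
     * the next pair; the last pair is added (with saturation in MSA). */
    static int dpadd3_lane(const int8_t in[6], const int8_t coeff[6])
    {
        int acc = in[0] * coeff[0] + in[1] * coeff[1];   /* __msa_dotp_s_h  */
        acc += in[2] * coeff[2] + in[3] * coeff[3];      /* __msa_dpadd_s_h */
        acc += in[4] * coeff[4] + in[5] * coeff[5];      /* dotp + adds_s_h */
        return acc;
    }

    int main(void)
    {
        const int8_t in[6]    = { 1, 2, 3, 4, 5, 6 };
        const int8_t coeff[6] = { -6, 123, 12, -1, 0, 0 };
        printf("%d\n", dpadd3_lane(in, coeff));  /* -6 + 246 + 36 - 4 = 272 */
        return 0;
    }
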
+
+/* Description : Pack even elements of input vectors & xor with 128
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
+                 Return Type - unsigned byte
+   Details     : Signed byte even elements from 'in0' and 'in1' are packed
+                 together in one vector and the resulting vector is xor'ed with
+                 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1)                         \
+({                                                         \
+    v16u8 out_m;                                           \
+    out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
+    out_m;                                                 \
+})
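
The xor with 128 in PCKEV_XORI128_UB maps a byte from the signed range
[-128, 127] back to the unsigned pixel range [0, 255]; it undoes the same xor
applied before signed filtering. A scalar sketch of one byte (illustrative
only):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int8_t signed_pixel = -96;                    /* filtered value    */
        uint8_t pixel = (uint8_t)signed_pixel ^ 128;  /* __msa_xori_b  128 */
        printf("%u\n", pixel);                        /* 32, i.e. -96 + 128 */
        return 0;
    }
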
+
+/* Description : Pack even byte elements and store byte vector in destination
+                 memory
+   Arguments   : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst)                 \
+{                                                   \
+    v16i8 tmp_m;                                    \
+    tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+    ST_SB(tmp_m, (pdst));                           \
+}
+
+/* Description : Horizontal 2 tap filter kernel code
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)         \
+({                                                               \
+    v16i8 tmp0_m;                                                \
+    v8u16 tmp1_m;                                                \
+                                                                 \
+    tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
+    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
+    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
+                                                                 \
+    tmp1_m;                                                      \
+})
+#endif  /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
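
HORIZ_2TAP_FILT_UH gathers the two taps for each output with a byte shuffle,
forms an unsigned byte dot product against the coefficient pair, and rounds
the sum down by 'shift' bits. A scalar model of one output sample of the
bilinear case, assuming the usual 7-bit coefficient scale (illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    /* One output of a horizontal 2-tap (bilinear) filter:
     * out = round((src[x] * c0 + src[x + 1] * c1) >> shift)
     * where c0 + c1 == 128, so shift is typically 7. */
    static uint8_t htap2(const uint8_t *src, int x, int c0, int c1, int shift)
    {
        int sum = src[x] * c0 + src[x + 1] * c1;               /* dotp_u_h */
        return (uint8_t)((sum + (1 << (shift - 1))) >> shift); /* srari_h  */
    }

    int main(void)
    {
        const uint8_t row[4] = { 10, 20, 30, 40 };
        /* Taps 64,64 correspond to a plain average of two neighbours. */
        printf("%u\n", htap2(row, 1, 64, 64, 7));  /* (20 + 30) / 2 = 25 */
        return 0;
    }
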
diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h
index a46fbfb..f39b675 100644
--- a/libvpx/vp8/common/onyx.h
+++ b/libvpx/vp8/common/onyx.h
@@ -108,7 +108,8 @@
          * For temporal denoiser: noise_sensitivity = 0 means off,
          * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
          * noise_sensitivity = 2 means temporal denoiser on for all channels.
-         * noise_sensitivity >= 3 means aggressive denoising mode.
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * noise_sensitivity >= 4 means adaptive denoising mode.
          * Temporal denoiser is enabled via the configuration option:
          * CONFIG_TEMPORAL_DENOISING.
          * For spatial denoiser: noise_sensitivity controls the amount of
@@ -121,6 +122,7 @@
         int Sharpness;
         int cpu_used;
         unsigned int rc_max_intra_bitrate_pct;
+        unsigned int screen_content_mode;
 
         /* mode ->
          *(0)=Realtime/Live Encoding. This mode is optimized for realtim
@@ -223,7 +225,7 @@
         int arnr_strength;
         int arnr_type;
 
-        struct vpx_fixed_buf        two_pass_stats_in;
+        vpx_fixed_buf_t        two_pass_stats_in;
         struct vpx_codec_pkt_list  *output_pkt_list;
 
         vp8e_tuning tuning;
diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c
index e50d393..a4e6ae1 100644
--- a/libvpx/vp8/common/postproc.c
+++ b/libvpx/vp8/common/postproc.c
@@ -214,6 +214,7 @@
     x = 50 + (x - 50) * 10 / 8;
     return x * x / 3;
 }
+
 void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
 {
     int r, c, i;
@@ -226,14 +227,14 @@
         int sumsq = 0;
         int sum   = 0;
 
-        for (i = -8; i<0; i++)
+        for (i = -8; i < 0; i++)
           s[i]=s[0];
 
         /* 17 avoids valgrind warning - we buffer values in c in d
          * and only write them when we've read 8 ahead...
          */
-        for (i = cols; i<cols+17; i++)
-          s[i]=s[cols-1];
+        for (i = 0; i < 17; i++)
+          s[i+cols]=s[cols-1];
 
         for (i = -8; i <= 6; i++)
         {
@@ -264,7 +265,6 @@
     }
 }
 
-
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 {
     int r, c, i;
@@ -284,8 +284,8 @@
         /* 17 avoids valgrind warning - we buffer values in c in d
          * and only write them when we've read 8 ahead...
          */
-        for (i = rows; i < rows+17; i++)
-          s[i*pitch]=s[(rows-1)*pitch];
+        for (i = 0; i < 17; i++)
+          s[(i+rows)*pitch]=s[(rows-1)*pitch];
 
         for (i = -8; i <= 6; i++)
         {
@@ -355,8 +355,8 @@
                 else
                     mb_ppl = (unsigned char)ppl;
 
-                vpx_memset(ylptr, mb_ppl, 16);
-                vpx_memset(uvlptr, mb_ppl, 8);
+                memset(ylptr, mb_ppl, 16);
+                memset(uvlptr, mb_ppl, 8);
 
                 ylptr += 16;
                 uvlptr += 8;
@@ -385,13 +385,13 @@
 }
 #endif
 
-#if !(CONFIG_TEMPORAL_DENOISING)
 void vp8_de_noise(VP8_COMMON                 *cm,
                   YV12_BUFFER_CONFIG         *source,
                   YV12_BUFFER_CONFIG         *post,
                   int                         q,
                   int                         low_var_thresh,
-                  int                         flag)
+                  int                         flag,
+                  int                         uvfilter)
 {
     int mbr;
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
@@ -403,7 +403,7 @@
     (void) low_var_thresh;
     (void) flag;
 
-    vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+    memset(limits, (unsigned char)ppl, 16 * mb_cols);
 
     /* TODO: The original code don't filter the 2 outer rows and columns. */
     for (mbr = 0; mbr < mb_rows; mbr++)
@@ -412,20 +412,22 @@
             source->y_buffer + 16 * mbr * source->y_stride,
             source->y_buffer + 16 * mbr * source->y_stride,
             source->y_stride, source->y_stride, source->y_width, limits, 16);
-
-        vp8_post_proc_down_and_across_mb_row(
-            source->u_buffer + 8 * mbr * source->uv_stride,
-            source->u_buffer + 8 * mbr * source->uv_stride,
-            source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
-        vp8_post_proc_down_and_across_mb_row(
-            source->v_buffer + 8 * mbr * source->uv_stride,
-            source->v_buffer + 8 * mbr * source->uv_stride,
-            source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+        if (uvfilter == 1) {
+          vp8_post_proc_down_and_across_mb_row(
+              source->u_buffer + 8 * mbr * source->uv_stride,
+              source->u_buffer + 8 * mbr * source->uv_stride,
+              source->uv_stride, source->uv_stride, source->uv_width, limits,
+              8);
+          vp8_post_proc_down_and_across_mb_row(
+              source->v_buffer + 8 * mbr * source->uv_stride,
+              source->v_buffer + 8 * mbr * source->uv_stride,
+              source->uv_stride, source->uv_stride, source->uv_width, limits,
+              8);
+        }
     }
 }
-#endif
 
-double vp8_gaussian(double sigma, double mu, double x)
+static double gaussian(double sigma, double mu, double x)
 {
     return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
            (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
@@ -453,7 +455,7 @@
 
         for (i = -32; i < 32; i++)
         {
-            const int v = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+            const int v = (int)(.5 + 256 * gaussian(sigma, 0, i));
 
             if (v)
             {
@@ -516,6 +518,7 @@
                            unsigned int Width, unsigned int Height, int Pitch)
 {
     unsigned int i, j;
+    (void)bothclamp;
 
     for (i = 0; i < Height; i++)
     {
@@ -760,7 +763,7 @@
             /* insure that postproc is set to all 0's so that post proc
              * doesn't pull random data in from edge
              */
-            vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+            memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
 
         }
     }
diff --git a/libvpx/vp8/common/postproc.h b/libvpx/vp8/common/postproc.h
index 33d0a7f..0fa12a7 100644
--- a/libvpx/vp8/common/postproc.h
+++ b/libvpx/vp8/common/postproc.h
@@ -39,7 +39,8 @@
                   YV12_BUFFER_CONFIG         *post,
                   int                         q,
                   int                         low_var_thresh,
-                  int                         flag);
+                  int                         flag,
+                  int                         uvfilter);
 
 void vp8_deblock(struct VP8Common           *oci,
                  YV12_BUFFER_CONFIG         *source,
diff --git a/libvpx/vp8/common/ppc/copy_altivec.asm b/libvpx/vp8/common/ppc/copy_altivec.asm
deleted file mode 100644
index a4ce915..0000000
--- a/libvpx/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
diff --git a/libvpx/vp8/common/ppc/filter_altivec.asm b/libvpx/vp8/common/ppc/filter_altivec.asm
deleted file mode 100644
index 4da2e94..0000000
--- a/libvpx/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl sixtap_predict_ppc
-    .globl sixtap_predict8x4_ppc
-    .globl sixtap_predict8x8_ppc
-    .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
-    load_c \V0, HFilter, r5, r9, r10
-
-    addi    r5,  r5, 16
-    lvx     \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
-    load_c v0, VFilter, r6, r3, r10
-
-    vspltish v5, 8
-    vspltish v6, 3
-    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v1, v0, 1
-    vspltb  v2, v0, 2
-    vspltb  v3, v0, 3
-    vspltb  v4, v0, 4
-    vspltb  v5, v0, 5
-    vspltb  v0, v0, 0
-.endm
-
-.macro vpre_load
-    Vprolog
-    li      r10,  16
-    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
-    lvx     v11, r10, r9
-    addi    r9,   r9, 32
-    lvx     v12,   0, r9
-    lvx     v13, r10, r9
-    addi    r9,   r9, 32
-    lvx     v14,   0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
-                                ;# (Re,Ro) += (V*T)
-    vmuleub \TMP, \V, \T        ;# trashes v8
-    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
-    vmuloub \TMP, \V, \T
-    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
-    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
-    vadduhm v16, v6, v8
-    vmuloub  v8, \P0, v0
-    vadduhm v17, v6, v8
-    Msum v16, v17, \P2, v2, v8
-    Msum v16, v17, \P3, v3, v8
-    Msum v16, v17, \P5, v5, v8
-
-    vmuleub v18, \P1, v1        ;# 2 negative taps
-    vmuloub v19, \P1, v1
-    Msum v18, v19, \P4, v4, v8
-
-    vsubuhs v16, v16, v18       ;# subtract neg from pos
-    vsubuhs v17, v17, v19
-    vsrh    v16, v16, v7        ;# divide by 128
-    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
-    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
-    vmrglh  v19, v16, v17
-    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
-    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
-    vadduhm v21, v20, v24
-    vmuloub v24, \P0, v13
-    vadduhm v22, v20, v24
-    Msum v21, v22, \P2, v15, v25
-    Msum v21, v22, \P3, v16, v25
-    Msum v21, v22, \P5, v18, v25
-
-    vmuleub v23, \P1, v14       ;# 2 negative taps
-    vmuloub v24, \P1, v14
-    Msum v23, v24, \P4, v17, v25
-
-    vsubuhs v21, v21, v23       ;# subtract neg from pos
-    vsubuhs v22, v22, v24
-    vsrh    v21, v21, v19       ;# divide by 128
-    vsrh    v22, v22, v19       ;# v16 v17 = evens, odds
-    vmrghh  v23, v21, v22       ;# v18 v19 = 16-bit result in order
-    vmrglh  v24, v21, v22
-    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
-    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
-    stvx    \P0, 0, r7
-    add     r7, r7, r8      ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
-    addi    r9,   r9, 16        ;# P5 = newest input row
-    lvx     \P5,   0, r9
-    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
-    luma_v v10, v11, v12, v13, v14, v15
-    luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
-    luma_vtwo
-    luma_v v12, v13, v14, v15, v10, v11
-    luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
-    luma_vfour
-    luma_v v14, v15, v10, v11, v12, v13
-    luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
-    vmsummbm \R, v13, \I, v15
-    vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     \VD,   0, \RS
-    lvx     v20, r10, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
-    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
-    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
-    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
-    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
-    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
-
-    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
-    vsrh    \R, \R, v19
-
-    vpkuhus \R, \R, \R          ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v20,   0, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, v20, v20, v21
-.endm
-    .text
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xff87
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    vertical_only_4x4
-
-    ;# load up horizontal filter
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_4x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_4x4
-
-vertical_only_4x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0, r3, r4, 1
-    Read8x8 v1, r3, r4, 1
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_4x4:
-    load_c   v20, b_hilo_4x4, 0, r9, r10
-    load_c   v21, b_hilo, 0, r9, r10
-
-    ;# reposition input so that it can go through the
-    ;# filtering phase with one pass.
-    vperm   v0, v0, v1, v20     ;# 0 1 x x
-    vperm   v2, v2, v3, v20     ;# 2 3 x x
-    vperm   v4, v4, v5, v20     ;# 4 5 x x
-    vperm   v6, v6, v7, v20     ;# 6 7 x x
-
-    vperm   v0, v0, v2, v21     ;# 0 1 2 3
-    vperm   v4, v4, v6, v21     ;# 4 5 6 7
-
-    vsldoi  v1, v0, v4, 4
-    vsldoi  v2, v0, v4, 8
-    vsldoi  v3, v0, v4, 12
-
-    vsldoi  v5, v4, v8, 4
-
-    load_c   v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
-    stvx    v0, 0, r1
-
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 4(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 8(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 12(r1)
-    stw     r0, 0(r7)
-
-    b       exit_4x4
-
-store_4x4:
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v4, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v5, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x4
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_8x4
-
-second_pass_pre_copy_8x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x4:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-    b       exit_8x4
-
-store_8x4:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x4
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned2_8x4:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;#  register there is no need to loop.  Everything can stay in registers.
-sixtap_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x8
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 1
-    Read8x8 v9, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-    interp_8x8 v9
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x8
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0,  r9, r4, 1
-    Read8x8 v1,  r9, r4, 0
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v10
-    interp_8x8 v11
-    interp_8x8 v12
-
-    b       second_pass_8x8
-
-second_pass_pre_copy_8x8:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-    Read8x8 v9,  r3, r4, 1
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x8:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
-    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
-    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
-    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-    b       exit_8x8
-
-store_8x8:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x8
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-    w_8x8   v8, r7, r0, r8
-    w_8x8   v9, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned2_8x8:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-    vperm   v8, v8, v9, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-    addi    r7, r7, 16
-    stvx    v8, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
-;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
-;#  temporary buffer because the source buffer can't be modified and the buffer
-;#  for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    ;# Three possiblities
-    ;#  1. First filter is null.  Don't use a temp buffer.
-    ;#  2. Second filter is null.  Don't use a temp buffer.
-    ;#  3. Neither are null, use temp buffer.
-
-    ;# First Pass (horizontal edge)
-    ;#  setup pointers for src
-    ;#  if possiblity (1) then setup the src pointer to be the orginal and jump
-    ;#  to second pass.  this is based on if x_offset is 0.
-
-    ;# load up horizontal filter
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    load_hfilter v4, v5
-
-    beq-    copy_horizontal_16x21
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements are guessing that there won't be a second pass,
-    ;#  but if there is then inside the bypass they need to be set
-    li      r0, 16              ;# prepare for no vertical filter
-
-    ;# Change the output pointer and pitch to be the actual
-    ;#  desination instead of a temporary buffer.
-    addi    r9, r7, 0
-    addi    r5, r8, 0
-
-    ;# no vertical filter, so write the output from the first pass
-    ;#  directly into the output buffer.
-    beq-    no_vertical_filter_bypass
-
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# setup counter for the number of lines that are going to be filtered
-    li      r0, 21
-
-    ;# use the stack as temporary storage
-    la      r9, 48(r1)
-    li      r5, 16
-
-no_vertical_filter_bypass:
-
-    mtctr   r0
-
-    ;# rounding added in on the multiply
-    vspltisw v10, 8
-    vspltisw v12, 3
-    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v13, 7
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-    lvx     v3, r12, r3
-
-    vperm   v8, v1, v2, v15
-    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
-
-    vsldoi  v11, v8, v9, 4
-
-    ;# set 0
-    vmsummbm v6, v4, v8, v12    ;# taps times elements
-    vmsummbm v0, v5, v11, v6
-
-    ;# set 1
-    vsldoi  v10, v8, v9, 1
-    vsldoi  v11, v8, v9, 5
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v1, v5, v11, v6
-
-    ;# set 2
-    vsldoi  v10, v8, v9, 2
-    vsldoi  v11, v8, v9, 6
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v2, v5, v11, v6
-
-    ;# set 3
-    vsldoi  v10, v8, v9, 3
-    vsldoi  v11, v8, v9, 7
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v3, v5, v11, v6
-
-    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
-
-    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
-    vsrh    v1, v1, v13
-
-    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
-    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
-
-    stvx    v0,  0, r9
-    add     r9, r9, r5
-
-    add     r3, r3, r4
-
-    bdnz    horizontal_loop_16x16
-
-    ;# check again to see if vertical filter needs to be done.
-    cmpi    cr0, r6, 0
-    beq     cr0, end_16x16
-
-    ;# yes there is, so go to the second pass
-    b       second_pass_16x16
-
-copy_horizontal_16x21:
-    li      r10, 21
-    mtctr   r10
-
-    li      r10, 16
-
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# this is done above if there is a horizontal filter,
-    ;#  if not it needs to be done down here.
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-    ;# always write to the stack when doing a horizontal copy
-    la      r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v8, v1, v2, v15
-
-    stvx    v8,  0, r9
-    addi    r9, r9, 16
-
-    add     r3, r3, r4
-
-    bdnz    copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
-    ;# always read from the stack when doing a vertical filter
-    la      r9, 48(r1)
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v7, 7
-
-    vpre_load
-
-    luma_vsix
-    luma_vsix
-    luma_vfour
-
-end_16x16:
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-HFilter:
-    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
-    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
-    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
-    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
-    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
-    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
-    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
-    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
-    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
-    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
-    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-
-    .align 4
-VFilter:
-    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-
-    .align 4
-b_hperm:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-B_0123:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-B_4567:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-B_89AB:
-    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
-    .align 4
-b_hilo:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-b_hilo_4x4:
-    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
diff --git a/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm b/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
deleted file mode 100644
index fd8aa66..0000000
--- a/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permute value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# \P0, v23 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
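For reference, the 2-tap coefficient pairs in hfilter_b and vfilter_b above always sum to 128, so each filtering pass rounds with +64 and shifts right by 7 (the 0x40 constant the prologue builds and the shift held in v19). A rough scalar sketch of the same two-pass bilinear prediction follows; the function and buffer names are illustrative only, this is not the project's C code, and the real code skips a pass entirely when its offset is zero.

/* Scalar sketch of two-pass bilinear prediction (illustrative only). */
static const int bilinear_taps[8][2] = {
    {128,  0}, {112, 16}, {96, 32}, {80,  48},
    { 64, 64}, { 48, 80}, {32, 96}, {16, 112}
};

static void bilinear_predict_sketch(const unsigned char *src, int src_pitch,
                                    int xoff, int yoff,
                                    unsigned char *dst, int dst_pitch,
                                    int w, int h)            /* w, h <= 16 */
{
    unsigned char tmp[(16 + 1) * 16];    /* h + 1 horizontally filtered rows */
    int r, c;

    /* first pass: filter horizontally into tmp */
    for (r = 0; r < h + 1; r++)
        for (c = 0; c < w; c++)
            tmp[r * w + c] =
                (src[r * src_pitch + c]     * bilinear_taps[xoff][0] +
                 src[r * src_pitch + c + 1] * bilinear_taps[xoff][1] + 64) >> 7;

    /* second pass: filter vertically from tmp into dst */
    for (r = 0; r < h; r++)
        for (c = 0; c < w; c++)
            dst[r * dst_pitch + c] =
                (tmp[r * w + c]       * bilinear_taps[yoff][0] +
                 tmp[(r + 1) * w + c] * bilinear_taps[yoff][1] + 64) >> 7;
}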
diff --git a/libvpx/vp8/common/ppc/idctllm_altivec.asm b/libvpx/vp8/common/ppc/idctllm_altivec.asm
deleted file mode 100644
index 117d9cf..0000000
--- a/libvpx/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4llm_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
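The constants sinpi8sqrt2 = 35468 and cospi8sqrt2minus1 = 20091 above are Q16 fixed-point values of sqrt(2)*sin(pi/8) and sqrt(2)*cos(pi/8) - 1. Because 35468 does not fit in a signed halfword, vmulosh effectively multiplies by a negative value and the code adds the operand back after the 16-bit shift; the "minus 1" constant is recovered with the same add-back. A plain scalar outline of one column pass, for exposition only (not the project's C implementation):

#define SINPI8SQRT2        35468   /* Q16: sqrt(2) * sin(pi/8)     */
#define COSPI8SQRT2MINUS1  20091   /* Q16: sqrt(2) * cos(pi/8) - 1 */

/* One column pass of the 4x4 inverse transform.  The full transform runs
 * this over columns, then over rows, and finally rounds with (x + 4) >> 3,
 * matching the vector code's two passes and its closing vaddshs/vsrah. */
static void idct4_col_pass(const short in[16], int out[16])
{
    int i;
    for (i = 0; i < 4; i++) {
        int a1 = in[i] + in[i + 8];
        int b1 = in[i] - in[i + 8];
        int c1 = ((in[i + 4] * SINPI8SQRT2) >> 16) -
                 (in[i + 12] + ((in[i + 12] * COSPI8SQRT2MINUS1) >> 16));
        int d1 = (in[i + 4] + ((in[i + 4] * COSPI8SQRT2MINUS1) >> 16)) +
                 ((in[i + 12] * SINPI8SQRT2) >> 16);
        out[i]      = a1 + d1;
        out[i + 12] = a1 - d1;
        out[i + 4]  = b1 + c1;
        out[i + 8]  = b1 - c1;
    }
}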
diff --git a/libvpx/vp8/common/ppc/loopfilter_altivec.c b/libvpx/vp8/common/ppc/loopfilter_altivec.c
deleted file mode 100644
index 71bf6e2..0000000
--- a/libvpx/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-    unsigned char *s,   // source pointer
-    int p,              // pitch
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-    unsigned char *u,   // source pointer
-    unsigned char *v,   // source pointer
-    int p,              // pitch
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-    unsigned char *s,   // source pointer
-    int p,              // pitch
-    const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    // These should all be done at once with one call, instead of 3
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-    loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-    loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-    if (u_ptr)
-        loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void)u_ptr;
-    (void)v_ptr;
-    (void)uv_stride;
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-    loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
diff --git a/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm b/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
deleted file mode 100644
index 61df4e9..0000000
--- a/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl mbloop_filter_horizontal_edge_y_ppc
-    .globl loop_filter_horizontal_edge_y_ppc
-    .globl mbloop_filter_vertical_edge_y_ppc
-    .globl loop_filter_vertical_edge_y_ppc
-
-    .globl mbloop_filter_horizontal_edge_uv_ppc
-    .globl loop_filter_horizontal_edge_uv_ppc
-    .globl mbloop_filter_vertical_edge_uv_ppc
-    .globl loop_filter_vertical_edge_uv_ppc
-
-    .globl loop_filter_simple_horizontal_edge_ppc
-    .globl loop_filter_simple_vertical_edge_ppc
-
-    .text
-;# We often need to perform transposes (and other transpose-like operations)
-;#   on matrices of data.  This is simplified by the fact that we usually
-;#   operate on hunks of data whose dimensions are powers of 2, or at least
-;#   divisible by highish powers of 2.
-;#
-;#   These operations can be very confusing.  They become more straightforward
-;#   when we think of them as permutations of address bits: Concatenate a
-;#   group of vector registers and think of it as occupying a block of
-;#   memory beginning at address zero.  The low four bits 0...3 of the
-;#   address then correspond to position within a register, the higher-order
-;#   address bits select the register.
-;#
-;#   Although register selection, at the code level, is arbitrary, things
-;#   are simpler if we use contiguous ranges of register numbers, simpler
-;#   still if the low-order bits of the register number correspond to
-;#   conceptual address bits.  We do this whenever reasonable.
-;#
-;#   A 16x16 transpose can then be thought of as an operation on
-;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;#   memory and the effect of a transpose is to interchange address bit
-;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;#   column, which is interchanged with the row addressed by bits 4..7.
-;#
-;#   The altivec merge instructions provide a rapid means of effecting
-;#   many of these transforms.  They operate at three widths (8,16,32).
-;#   Writing V(x) for vector register #x, paired merges permute address
-;#   indices as follows.
-;#
-;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#      vmrghb  V( x),          V( y), V( y + (1<<s))
-;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#      vmrghh  V( x),          V( y), V( y + (1<<s))
-;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
-;#
-;#      vmrghw  V( x),          V( y), V( y + (1<<s))
-;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   Unfortunately, there is no doubleword merge instruction.
-;#   The following sequence uses "vperm" as a substitute.
-;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;#   are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
-;#
-;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
-;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;#   Except for bits s and d, the other relationships between register
-;#   number (= high-order part of address) bits are at the disposal of
-;#   the programmer.
-;#
-
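Put concretely, treating 16 registers of 16 bytes as one 256-byte block, the full 16x16 transpose is nothing more than swapping the low and high nibbles of each byte's index. A C illustration of that address-bit exchange (not part of the source, for exposition only):

#include <string.h>

static void transpose16x16_bytes(unsigned char m[256])
{
    unsigned char t[256];
    int i;
    for (i = 0; i < 256; i++)
        t[((i & 0x0f) << 4) | (i >> 4)] = m[i];  /* swap row/column nibbles */
    memcpy(m, t, 256);
}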
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;#   edges together.  This requires a single 16x16 transpose, which, in
-;#   the above language, amounts to the following permutation of address
-;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
-;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;#   Except for the fact that the destination registers get written
-;#   before we are done referencing the old contents, the cyclic transform
-;#   is effected by
-;#
-;#      x = 0;  do {
-;#          vmrghb V(2x),   V(x), V(x+8);
-;#          vmrghb V(2x+1), V(x), V(x+8);
-;#      } while( ++x < 8);
-;#
-;#   For clarity, and because we can afford it, we do this transpose
-;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
-;#   leaving the final result in 16 .. 31, as the lower registers are
-;#   used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1, v16,v24
-    Tpair v2,v3, v17,v25
-    Tpair v4,v5, v18,v26
-    Tpair v6,v7, v19,v27
-    Tpair v8,v9, v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 46 47
-;#  v3 = 48 49 ... 62 63
-;#
-;#  In frame-buffer memory, the layout is:
-;#
-;#     0  16  32  48
-;#     1  17  33  49
-;#     ...
-;#    15  31  47  63.
-;#
-;#  We begin by reading the data 32 bits at a time (using scalar operations)
-;#  into a temporary array, reading the rows of the array into vector registers,
-;#  with the following layout:
-;#
-;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
-;#  v1 =  1 17 33 49  5 21 ...                      45 61
-;#  v2 =  2 18 ...                                  46 62
-;#  v3 =  3 19 ...                                  47 63
-;#
-;#  From the "address-bit" perspective discussed above, we simply need to
-;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;#  In other words, we transpose each of the four 4x4 submatrices.
-;#
-;#  This transformation is its own inverse, and we need to perform it
-;#  again before writing the pixels back into the frame buffer.
-;#
-;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;#  defined above.  We think of both groups of 4 registers as having
-;#  "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
-    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
-
-    vmrghb  v4, v0, v1
-    vmrglb  v5, v0, v1
-    vmrghb  v6, v2, v3
-    vmrglb  v7, v2, v3
-
-    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
-
-    vmrghh  v0, v4, v6
-    vmrglh  v1, v4, v6
-    vmrghh  v2, v5, v7
-    vmrglh  v3, v5, v7
-
-    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
-
-    vmrghw  v4, v0, v1
-    vmrglw  v5, v0, v1
-    vmrghw  v6, v2, v3
-    vmrglw  v7, v2, v3
-
-    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
-
-    vperm   v0, v4, v6, \Vlo
-    vperm   v1, v4, v6, \Vhi
-    vperm   v2, v5, v7, \Vlo
-    vperm   v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;#   We read 8 columns of data, initially in the following pattern:
-;#
-;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
-;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
-;#  ...
-;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;#   and wish to convert to:
-;#
-;#  (0,0) ... (0,15)
-;#  (1,0) ... (1,15)
-;#  ...
-;#  (7,0) ... (7,15).
-;#
-;#  In "address bit" language, we wish to map
-;#
-;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
-;#
-;#  This can be accomplished by 4 iterations of the cyclic transform
-;#
-;#  I -> (I+1) mod 7;
-;#
-;#  each iteration can be realized by (d=0, s=2):
-;#
-;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
-;#
-;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
-;#  preserving v8 = sign converter.
-;#
-;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;#  result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
-    Tpair v10, v11,  v0, v4
-    Tpair v12, v13,  v1, v5
-    Tpair v14, v15,  v2, v6
-    Tpair v16, v17,  v3, v7
-.endm
-
-.macro t8x16_even
-    Tpair v0, v1,  v10, v14
-    Tpair v2, v3,  v11, v15
-    Tpair v4, v5,  v12, v16
-    Tpair v6, v7,  v13, v17
-.endm
-
-.macro transpose8x16_fwd
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-    t8x16_even
-.endm
-
-.macro transpose8x16_inv
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-.endm
-
-.macro Transpose16x16
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;#   into vector register Vreg.  Trashes r0
-.macro load_g Vreg, Gptr
-    lwz     r0, \Gptr
-    lvx     \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here.  if the answer is negative
-;# it will be clamped to 0.  orring 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
-    vsububs \RES, \A, \B
-    vsububs \TMP, \B, \A
-    vor     \RES, \RES, \TMP
-.endm
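A scalar picture of the same trick (illustrative only): for unsigned bytes one of the two saturating differences is zero, so OR-ing them yields the absolute difference.

static unsigned char abs_diff_u8(unsigned char a, unsigned char b)
{
    unsigned char d1 = (a > b) ? (unsigned char)(a - b) : 0;  /* saturating a - b */
    unsigned char d2 = (b > a) ? (unsigned char)(b - a) : 0;  /* saturating b - a */
    return d1 | d2;                                           /* one term is zero  */
}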
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
-    vsububs \TMP, \A, \B
-    vmaxub  \RES, \RES, \TMP
-    vsububs \TMP, \B, \A
-    vmaxub  \RES, \RES, \TMP
-.endm
-
-.macro Masks
-    ;# build masks
-    ;# input is all 8 bit unsigned (0-255).  need to
-    ;# do abs(vala-valb) > limit.  but no need to compare each
-    ;# value to the limit.  find the max of the absolute differences
-    ;# and compare that to the limit.
-    ;# First hev
-    Abs     v14, v13, v2, v3    ;# |P1 - P0|
-    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
-    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
-    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
-    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
-
-    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx     \FL, 0, \RFL        ;# flimit
-    lvx     \LI, 0, \RLI        ;# limit
-    lvx     \TH, 0, \RTH        ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add     r5, r4, r4          ;# r5 = 2 * stride
-    sub     r6, r3, r5          ;# r6 -> 2 rows back
-    neg     r7, r4              ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
-    lvx     v0,  0, r0          ;# P3  (read only)
-    lvx     v1, r7, r6          ;# P2
-    lvx     v2,  0, r6          ;# P1
-    lvx     v3, r7, r3          ;# P0
-    lvx     v4,  0, r3          ;# Q0
-    lvx     v5, r4, r3          ;# Q1
-    lvx     v6, r5, r3          ;# Q2
-    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
-    lvx     v7, r4, r0          ;# Q3  (read only)
-.endm
-
-;# Expects
-;#  v10 == HEV
-;#  v13 == tmp
-;#  v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor    \P1, \P1, v11       ;# SP1
-    vxor    \P0, \P0, v11       ;# SP0
-    vxor    \Q0, \Q0, v11       ;# SQ0
-    vxor    \Q1, \Q1, v11       ;# SQ1
-
-    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand    v13, v13, v10       ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
-
-    vsrab   v13, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
-.endm
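A scalar sketch of the adjustment above (names are illustrative, not the project's C code; the final xor back to unsigned pels, which the callers perform separately, is folded in here). hev and mask are 0 or -1 per pel; the vector code keeps the limit mask in the opposite sense and applies it with vandc rather than vand.

static signed char clamp_s8(int v)
{
    return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void common_adjust_sketch(unsigned char *p1, unsigned char *p0,
                                 unsigned char *q0, unsigned char *q1,
                                 int hev_present, int hev, int mask)
{
    signed char sp1 = (signed char)(*p1 ^ 0x80);   /* convert pels to signed */
    signed char sp0 = (signed char)(*p0 ^ 0x80);
    signed char sq0 = (signed char)(*q0 ^ 0x80);
    signed char sq1 = (signed char)(*q1 ^ 0x80);
    signed char d   = clamp_s8(sq0 - sp0);
    signed char f   = clamp_s8(sp1 - sq1);

    if (hev_present)
        f &= hev;                   /* outer taps only on high edge variance */
    f = clamp_s8(f + d);
    f = clamp_s8(f + d);
    f = clamp_s8(f + d);            /* f = clamp(clamp(P1-Q1) + 3*(Q0-P0))   */
    f &= mask;                      /* no change where the filter is off     */

    *q0 = (unsigned char)clamp_s8(sq0 - (clamp_s8(f + 4) >> 3)) ^ 0x80;
    *p0 = (unsigned char)clamp_s8(sp0 + (clamp_s8(f + 3) >> 3)) ^ 0x80;
}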
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the filtering here
-    vxor    v1, v1, v11         ;# SP2
-    vxor    v2, v2, v11         ;# SP1
-    vxor    v3, v3, v11         ;# SP0
-    vxor    v4, v4, v11         ;# SQ0
-    vxor    v5, v5, v11         ;# SQ1
-    vxor    v6, v6, v11         ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3         ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-    vand    v15, v13, v10       ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
-
-    vsrab   v14, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor    v8, v8, v8
-    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9        ;# +=  63
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
-    vaddsbs v1, v1, v10
-
-    vxor    v6, v6, v11
-    vxor    v1, v1, v11
-
-    ;# roughly 2/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
-    vaddsbs v2, v2, v10
-
-    vxor    v5, v5, v11
-    vxor    v2, v2, v11
-
-    ;# roughly 3/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
-    vaddsbs v3, v3, v10
-
-    vxor    v4, v4, v11
-    vxor    v3, v3, v11
-.endm
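The three "roughly N/7th" steps above scale the same filter value by 9, 18 and 27 before the +63, >>7 rounding and apply the result to the P2/Q2, P1/Q1 and P0/Q0 pairs respectively. A minimal scalar equivalent of one step, for illustration only:

static signed char seventh(int f, int n)      /* n = 1, 2 or 3 */
{
    int v = (9 * n * f + 63) >> 7;            /* ~ n/7 of f, rounded */
    return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

/* usage (all adds/subtracts saturating, as in the vector code):
 *   q2 -= seventh(f, 1);  p2 += seventh(f, 1);
 *   q1 -= seventh(f, 2);  p1 += seventh(f, 2);
 *   q0 -= seventh(f, 3);  p0 += seventh(f, 3);                    */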
-
-.macro SBFilter
-    Masks
-
-    common_adjust v3, v4, v2, v5, 1
-
-    ;# outer tap adjustments
-    vspltisb v8, 1
-
-    vaddubm v13, v13, v8        ;# f  += 1
-    vsrab   v13, v13, v8        ;# f >>= 1
-
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
-    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
-
-    vxor    v2, v2, v11
-    vxor    v3, v3, v11
-    vxor    v4, v4, v11
-    vxor    v5, v5, v11
-.endm
-
-    .align 2
-mbloop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    vp8_mbfilter
-
-    stvx     v1, r7, r6         ;# P2
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-    stvx     v6, r5, r3         ;# Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    SBFilter
-
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary.
-;#  So we can read in an entire mb aligned.  However if we want to filter the mb
-;#  edge we run into problems.  For the loopfilter we require 4 bytes before the mb
-;#  and 4 after for a total of 8 bytes.  Reading 16 bytes in order to get 4 is a bit
-;#  of a waste.  So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;#  memory to align and order them up.  Then they are read in using the
-;#  vector register file.
-.macro RLVmb V, R
-    lwzux   r0, r3, r4
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r3, r4
-    stw     r0,12(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-.macro WLVmb V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 8(\R)
-    stw     r0,-4(r3)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-    sub     r3, r3, r4
-
-    RLVmb v0, r9
-    RLVmb v1, r9
-    RLVmb v2, r9
-    RLVmb v3, r9
-    RLVmb v4, r9
-    RLVmb v5, r9
-    RLVmb v6, r9
-    RLVmb v7, r9
-
-    transpose8x16_fwd
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    vp8_mbfilter
-
-    transpose8x16_inv
-
-    add r3, r3, r4
-    neg r4, r4
-
-    WLVmb v17, r9
-    WLVmb v16, r9
-    WLVmb v15, r9
-    WLVmb v14, r9
-    WLVmb v13, r9
-    WLVmb v12, r9
-    WLVmb v11, r9
-    WLVmb v10, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RL V, R, P
-    lvx     \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro WL V, R, P
-    stvx    \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
-                                ;# K = |P0-P1| already
-    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
-    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
-    vcmpgtub v10, v14, v0
-
-    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|
-
-    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
-    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
-    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
-
-    vmaxub   v14, v14, v4       ;# M = max interior abs diff
-    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
-
-    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
-    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
-    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
-
-    ;# replace P1,Q1 w/signed versions
-    common_adjust \P0, \Q0, \P1, \Q1, 1
-
-    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
-    vsrab   v13, v13, v1
-    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
-    vsubsbs \Q1, \Q1, v13
-    vaddsbs \P1, \P1, v13
-
-    vxor    \P1, \P1, v11       ;# P1
-    vxor    \P0, \P0, v11       ;# P0
-    vxor    \Q0, \Q0, v11       ;# Q0
-    vxor    \Q1, \Q1, v11       ;# Q1
-.endm
-
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    addi    r9, r3, 0
-    RL      v16, r9, r4
-    RL      v17, r9, r4
-    RL      v18, r9, r4
-    RL      v19, r9, r4
-    RL      v20, r9, r4
-    RL      v21, r9, r4
-    RL      v22, r9, r4
-    RL      v23, r9, r4
-    RL      v24, r9, r4
-    RL      v25, r9, r4
-    RL      v26, r9, r4
-    RL      v27, r9, r4
-    RL      v28, r9, r4
-    RL      v29, r9, r4
-    RL      v30, r9, r4
-    lvx     v31, 0, r9
-
-    Transpose16x16
-
-    vspltisb v1, 1
-
-    build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18                            ;# K(v14) = first |P0-P1|
-
-    Fil v16, v17, v18, v19,  v20, v21, v22, v23
-    Fil v20, v21, v22, v23,  v24, v25, v26, v27
-    Fil v24, v25, v26, v27,  v28, v29, v30, v31
-
-    Transpose16x16
-
-    addi    r9, r3, 0
-    WL      v16, r9, r4
-    WL      v17, r9, r4
-    WL      v18, r9, r4
-    WL      v19, r9, r4
-    WL      v20, r9, r4
-    WL      v21, r9, r4
-    WL      v22, r9, r4
-    WL      v23, r9, r4
-    WL      v24, r9, r4
-    WL      v25, r9, r4
-    WL      v26, r9, r4
-    WL      v27, r9, r4
-    WL      v28, r9, r4
-    WL      v29, r9, r4
-    WL      v30, r9, r4
-    stvx    v31, 0, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
-    andi.   r7, r3, 8       ;# row origin modulo 16
-    add     r7, r7, r7      ;# selects selectors
-    lis     r12, _chromaSelectors@ha
-    la      r0,  _chromaSelectors@l(r12)
-    lwzux   r0, r7, r0      ;# leave selector addr in r7
-
-    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
-    lvx     \U, \Offs, r3
-    lvx     \V, \Offs, r4
-    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
-    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
-    vperm   \V, \New, \V, \Vmask
-    stvx    \U, \Offs, r3           ;# Write to frame buffer
-    stvx    \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
-    neg     r9, r5          ;# r9 = -1 * stride
-    add     r8, r9, r9      ;# r8 = -2 * stride
-    add     r10, r5, r5     ;# r10 = 2 * stride
-
-    active_chroma_sel v12
-
-    ;# P3, Q3 are read-only; need not save addresses or sibling pels
-    add     r6, r8, r8      ;# r6 = -4 * stride
-    hread_uv v0, v14, v15, r6, v12
-    add     r6, r10, r5     ;# r6 =  3 * stride
-    hread_uv v7, v14, v15, r6, v12
-
-    ;# Others are read/write; save addresses and sibling pels
-
-    add     r6, r8, r9      ;# r6 = -3 * stride
-    hread_uv v1, v16, v17, r6,  v12
-    hread_uv v2, v18, v19, r8,  v12
-    hread_uv v3, v20, v21, r9,  v12
-    hread_uv v4, v22, v23, 0,   v12
-    hread_uv v5, v24, v25, r5,  v12
-    hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
-    load_g   \V, 4(r7)
-.endm
-
-.macro vresult_sel V
-    load_g   \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
-    uresult_sel v11
-    vresult_sel v12
-    hwrite_uv v2, v18, v19, r8, v11, v12
-    hwrite_uv v3, v20, v21, r9, v11, v12
-    hwrite_uv v4, v22, v23, 0,  v11, v12
-    hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    vp8_mbfilter
-
-    store_chroma_h
-
-    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
-    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    SBFilter
-
-    store_chroma_h
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro R V, R
-    lwzux   r0, r3, r5
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r4, r5
-    stw     r0,12(\R)
-    lwz     r0,-4(r4)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-
-.macro W V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r4, r5
-    lwz     r0, 8(\R)
-    stw     r0,-4(r4)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r5
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-.macro chroma_vread R
-    sub r3, r3, r5          ;# back up one line for simplicity
-    sub r4, r4, r5
-
-    R v0, \R
-    R v1, \R
-    R v2, \R
-    R v3, \R
-    R v4, \R
-    R v5, \R
-    R v6, \R
-    R v7, \R
-
-    transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
-    transpose8x16_inv
-
-    add     r3, r3, r5
-    add     r4, r4, r5
-    neg     r5, r5          ;# Write rows back in reverse order
-
-    W v17, \R
-    W v16, \R
-    W v15, \R
-    W v14, \R
-    W v13, \R
-    W v12, \R
-    W v11, \R
-    W v10, \R
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    vp8_mbfilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    SBFilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
-    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit
-
-    ;# preserve unsigned v0 and v3
-    common_adjust v1, v2, v0, v3, 0
-
-    vxor v1, v1, v11
-    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
-    addi    r8,  0, 16
-    addi    r7, r5, 32
-
-    lvx     v0,  0, r5
-    lvx     v1, r8, r5
-    lvx     v2,  0, r7
-    lvx     v3, r8, r7
-
-    lis     r12, _B_hihi@ha
-    la      r0,  _B_hihi@l(r12)
-    lvx     v16, 0, r0
-
-    lis     r12, _B_lolo@ha
-    la      r0,  _B_lolo@l(r12)
-    lvx     v17, 0, r0
-
-    Transpose4times4x4 v16, v17
-    vp8_simple_filter
-
-    vxor v0, v0, v11
-    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels
-
-    Transpose4times4x4 v16, v17
-
-    stvx    v0,  0, r5
-    stvx    v1, r8, r5
-    stvx    v2,  0, r7
-    stvx    v3, r8, r7
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    neg     r5, r4              ;# r5 = -1 * stride
-    add     r6, r5, r5          ;# r6 = -2 * stride
-
-    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
-    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
-    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
-    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
-
-    vp8_simple_filter
-
-    stvx    v1, r5, r3          ;# store P0
-    stvx    v2,  0, r3          ;# store Q0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RLV Offs
-    stw     r0, (\Offs*4)(r5)
-    lwzux   r0, r7, r4
-.endm
-
-.macro WLV Offs
-    lwz     r0, (\Offs*4)(r5)
-    stwux   r0, r7, r4
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    la r5, -96(r1)              ;# temporary space for reading in vectors
-
-    ;# Store 4 pels at word "Offs" in temp array, then advance r7
-    ;#   to next row and read another 4 pels from the frame buffer.
-
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r7          ;# read first 4 pels
-
-    ;# 16 unaligned word accesses
-    RLV 0
-    RLV 4
-    RLV 8
-    RLV 12
-    RLV 1
-    RLV 5
-    RLV 9
-    RLV 13
-    RLV 2
-    RLV 6
-    RLV 10
-    RLV 14
-    RLV 3
-    RLV 7
-    RLV 11
-
-    stw     r0, (15*4)(r5)      ;# write last 4 pels
-
-    simple_vertical
-
-    ;# Read temp array, write frame buffer.
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r5          ;# read/write first 4 pels
-    stwx    r0,  0, r7
-
-    WLV 4
-    WLV 8
-    WLV 12
-    WLV 1
-    WLV 5
-    WLV 9
-    WLV 13
-    WLV 2
-    WLV 6
-    WLV 10
-    WLV 14
-    WLV 3
-    WLV 7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-_chromaSelectors:
-    .long   _B_hihi
-    .long   _B_Ures0
-    .long   _B_Vres0
-    .long   0
-    .long   _B_lolo
-    .long   _B_Ures8
-    .long   _B_Vres8
-    .long   0
-
-    .align 4
-_B_Vres8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
-
-    .align 4
-_B_lolo:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/libvpx/vp8/common/ppc/platform_altivec.asm b/libvpx/vp8/common/ppc/platform_altivec.asm
deleted file mode 100644
index f81d86f..0000000
--- a/libvpx/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_context:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
diff --git a/libvpx/vp8/common/ppc/recon_altivec.asm b/libvpx/vp8/common/ppc/recon_altivec.asm
deleted file mode 100644
index dd39e05..0000000
--- a/libvpx/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl recon4b_ppc
-    .globl recon2b_ppc
-    .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
-    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
-    addi    \Pred, \Pred, 16        ;# next pred
-    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff           ;# v3 = d0..d7
-    vaddshs v2, v2, v3              ;# v2 = r0..r7
-    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff           ;# v3 = d8..d15
-    addi    \Diff, \Diff, 32        ;# next diff
-    vaddshs v3, v3, v1              ;# v3 = r8..r15
-    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, \Dst            ;# to dst
-    add     \Dst, \Dst, \Stride     ;# next dst
-.endm
-
-    .text
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon4b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
-    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
-    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff       ;# v3 = d0..d7
-    vaddshs v2, v2, v3          ;# v2 = r0..r7
-    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff       ;# v3 = d8..d15
-    vaddshs v3, v3, v1          ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10         ;# 2 rows to dst from buf
-    lwz     r0, 0(r10)
-.if \write_first_four_pels
-    stw     r0, 0(\Dst)
-    .else
-    stwux   r0, \Dst, \Stride
-.endif
-    lwz     r0, 4(r10)
-    stw     r0, 4(\Dst)
-    lwz     r0, 8(r10)
-    stwux   r0, \Dst, \Stride       ;# advance dst to next row
-    lwz     r0, 12(r10)
-    stw     r0, 4(\Dst)
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-
-recon2b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    la      r10, -48(r1)                ;# buf
-
-    two_rows_of8 r3, r4, r5, r6, 1
-
-    addi    r4, r4, 16;                 ;# next pred
-    addi    r3, r3, 32;                 ;# next diff
-
-    two_rows_of8 r3, r4, r5, r6, 0
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro get_two_diff_rows
-    stw     r0, 0(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 4(r10)
-    lwzu    r0, 32(r3)
-    stw     r0, 8(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 12(r10)
-    lvx     v3, 0, r10
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon_b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-
-    la      r10, -48(r1)    ;# buf
-
-    lwz     r0, 0(r4)
-    stw     r0, 0(r10)
-    lwz     r0, 16(r4)
-    stw     r0, 4(r10)
-    lwz     r0, 32(r4)
-    stw     r0, 8(r10)
-    lwz     r0, 48(r4)
-    stw     r0, 12(r10)
-
-    lvx     v1,  0, r10;    ;# v1 = pred = p0..p15
-
-    lwz r0, 0(r3)           ;# v3 = d0..d7
-
-    get_two_diff_rows
-
-    vmrghb  v2, v0, v1;     ;# v2 = 16-bit p0..p7
-    vaddshs v2, v2, v3;     ;# v2 = r0..r7
-
-    lwzu r0, 32(r3)         ;# v3 = d8..d15
-
-    get_two_diff_rows
-
-    vmrglb  v1, v0, v1;     ;# v1 = 16-bit p8..p15
-    vaddshs v3, v3, v1;     ;# v3 = r8..r15
-
-    vpkshus v2, v2, v3;     ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10;    ;# 16 pels to dst from buf
-
-    lwz     r0, 0(r10)
-    stw     r0, 0(r5)
-    lwz     r0, 4(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 8(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 12(r10)
-    stwx    r0, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
diff --git a/libvpx/vp8/common/ppc/sad_altivec.asm b/libvpx/vp8/common/ppc/sad_altivec.asm
deleted file mode 100644
index e5f2638..0000000
--- a/libvpx/vp8/common/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sad16x16_ppc
-    .globl vp8_sad16x8_ppc
-    .globl vp8_sad8x16_ppc
-    .globl vp8_sad8x8_ppc
-    .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v8, 0              ;# zero out total to start
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v8, v6, v8
-.endm
-
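-;# The 16-wide SAD loop is software pipelined: the next pair of rows is
-;#  loaded while the absolute differences of the current rows are
-;#  accumulated, processing two rows per loop iteration.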
-.macro sad_16_loop loop_label
-    lvsl    v3,  0, r5          ;# only needs to be done once per block
-
-    ;# preload a line of data before getting into the loop
-    lvx     v4, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    add     r5, r5, r6
-    add     r3, r3, r4
-
-    vperm   v5, v1, v2, v3
-
-    .align 4
-\loop_label:
-    ;# compute difference on first row
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-
-    ;# load up next set of data
-    lvx     v9, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    ;# perform abs() of difference
-    vor     v6, v6, v7
-    add     r3, r3, r4
-
-    ;# add to the running tally
-    vsum4ubs v8, v6, v8
-
-    ;# now onto the next line
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-    lvx     v4, 0, r3
-
-    ;# compute difference on second row
-    vsububs v6, v9, v5
-    lvx     v1,  0, r5
-    vsububs v7, v5, v9
-    lvx     v2, r10, r5
-    vor     v6, v6, v7
-    add     r3, r3, r4
-    vsum4ubs v8, v6, v8
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
-    .align 4
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v7, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v7
-
-    SAD_16
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_16_loop sad16x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_16_loop sad16x8_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_8_loop sad8x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_8_loop sad8x8_loop
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r7, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r7,  4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    vspltisw v8, 0              ;# zero out total to start
-
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v7, v6, v8
-    vsumsws v7, v7, v8
-
-    stvx    v7, 0, r1
-    lwz     r3, 12(r1)
-
-    epilogue
-
-    blr
diff --git a/libvpx/vp8/common/ppc/systemdependent.c b/libvpx/vp8/common/ppc/systemdependent.c
deleted file mode 100644
index 6899c0e..0000000
--- a/libvpx/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_post_proc_down_and_across_mb_row)(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int src_pixels_per_line,
-    int dst_pixels_per_line,
-    int cols,
-    unsigned char *f,
-    int size
-);
-
-extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp8_post_proc_down_and_across_mb_row_c
-(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int src_pixels_per_line,
-    int dst_pixels_per_line,
-    int cols,
-    unsigned char *f,
-    int size
-);
-void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp8_copy_mem16x16;
-extern copy_mem_block_function *vp8_copy_mem8x8;
-extern copy_mem_block_function *vp8_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp8_sixtap_predict_c;
-extern subpixel_predict_function vp8_sixtap_predict8x4_c;
-extern subpixel_predict_function vp8_sixtap_predict8x8_c;
-extern subpixel_predict_function vp8_sixtap_predict16x16_c;
-extern subpixel_predict_function vp8_bilinear_predict4x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x8_c;
-extern subpixel_predict_function vp8_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp8_copy_mem16x16_c;
-extern copy_mem_block_function vp8_copy_mem8x8_c;
-extern copy_mem_block_function vp8_copy_mem8x4_c;
-
-void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp8_loop_filter_mbv_c;
-extern loop_filter_block_function vp8_loop_filter_bv_c;
-extern loop_filter_block_function vp8_loop_filter_mbh_c;
-extern loop_filter_block_function vp8_loop_filter_bh_c;
-
-extern loop_filter_block_function vp8_loop_filter_mbvs_c;
-extern loop_filter_block_function vp8_loop_filter_bvs_c;
-extern loop_filter_block_function vp8_loop_filter_mbhs_c;
-extern loop_filter_block_function vp8_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp8_clear_c(void)
-{
-}
-
-void vp8_machine_specific_config(void)
-{
-    // Pure C:
-    vp8_clear_system_state                = vp8_clear_c;
-    vp8_recon_b                          = vp8_recon_b_c;
-    vp8_recon4b                         = vp8_recon4b_c;
-    vp8_recon2b                         = vp8_recon2b_c;
-
-    vp8_bilinear_predict16x16            = bilinear_predict16x16_ppc;
-    vp8_bilinear_predict8x8              = bilinear_predict8x8_ppc;
-    vp8_bilinear_predict8x4              = bilinear_predict8x4_ppc;
-    vp8_bilinear_predict                 = bilinear_predict4x4_ppc;
-
-    vp8_sixtap_predict16x16              = sixtap_predict16x16_ppc;
-    vp8_sixtap_predict8x8                = sixtap_predict8x8_ppc;
-    vp8_sixtap_predict8x4                = sixtap_predict8x4_ppc;
-    vp8_sixtap_predict                   = sixtap_predict_ppc;
-
-    vp8_short_idct4x4_1                  = vp8_short_idct4x4llm_1_c;
-    vp8_short_idct4x4                    = short_idct4x4llm_ppc;
-    vp8_dc_only_idct                      = vp8_dc_only_idct_c;
-
-    vp8_lf_mbvfull                       = loop_filter_mbv_ppc;
-    vp8_lf_bvfull                        = loop_filter_bv_ppc;
-    vp8_lf_mbhfull                       = loop_filter_mbh_ppc;
-    vp8_lf_bhfull                        = loop_filter_bh_ppc;
-
-    vp8_lf_mbvsimple                     = loop_filter_mbvs_ppc;
-    vp8_lf_bvsimple                      = loop_filter_bvs_ppc;
-    vp8_lf_mbhsimple                     = loop_filter_mbhs_ppc;
-    vp8_lf_bhsimple                      = loop_filter_bhs_ppc;
-
-    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
-    vp8_mbpost_proc_down                  = vp8_mbpost_proc_down_c;
-    vp8_mbpost_proc_across_ip              = vp8_mbpost_proc_across_ip_c;
-    vp8_plane_add_noise                   = vp8_plane_add_noise_c;
-
-    vp8_copy_mem16x16                    = copy_mem16x16_ppc;
-    vp8_copy_mem8x8                      = vp8_copy_mem8x8_c;
-    vp8_copy_mem8x4                      = vp8_copy_mem8x4_c;
-
-}
diff --git a/libvpx/vp8/common/ppc/variance_altivec.asm b/libvpx/vp8/common/ppc/variance_altivec.asm
deleted file mode 100644
index fb8d5bb..0000000
--- a/libvpx/vp8/common/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_get8x8var_ppc
-    .globl vp8_get16x16var_ppc
-    .globl vp8_mse16x16_ppc
-    .globl vp8_variance16x16_ppc
-    .globl vp8_variance16x8_ppc
-    .globl vp8_variance8x16_ppc
-    .globl vp8_variance8x8_ppc
-    .globl vp8_variance4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v7, 0              ;# zero for merging
-    vspltisw v8, 0              ;# zero out total to start
-    vspltisw v9, 0              ;# zero out total for dif^2
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first.  Unpack so a signed subtract
-    ;#  can be used.  Only have a halfword signed
-    ;#  subtract.  Do high, then low.
-    vmrghb  v2, v7, v4
-    vmrghb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    vmrglb  v2, v7, v4
-    vmrglb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-.endm
-
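-;# The variance macros below compute sse - ((sum*sum) >> DS), where 2^DS is
-;#  the number of pixels in the block (DS=8 for 16x16, DS=7 for 16x8/8x16,
-;#  DS=6 for 8x8).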
-.macro variance_16 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v0, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v0
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, get8x8var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, get16x16var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr   r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz    mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stw     r3, 0(r7)           ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x16_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x8_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r10,0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r10, 4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srlwi   r3, r3, 4           ;# (sum*sum) >> 4
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
diff --git a/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm b/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
deleted file mode 100644
index 2308373..0000000
--- a/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sub_pixel_variance4x4_ppc
-    .globl vp8_sub_pixel_variance8x8_ppc
-    .globl vp8_sub_pixel_variance8x16_ppc
-    .globl vp8_sub_pixel_variance16x8_ppc
-    .globl vp8_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v28 permutation value for the output
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li      r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22, v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# high half of 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first.  Unpack so a signed subtract
-    ;#  can be used.  Only have a halfword signed
-    ;#  subtract.  Do high, then low.
-    vmrghb  \t1, \z0, \src
-    vmrghb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb  \t1, \z0, \src
-    vmrglb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor     \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx    \sum, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    \sse, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r9)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16  v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl    v17,  0, \R         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, \R
-    lvx     v22, r10, \R
-
-.if \increment_counter
-    add     \R, \R, \P
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm   v0, v0, v1, v10
-    vperm   v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq     compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_and_align_16 v4,  r7, r8, 1
-    load_and_align_16 v5,  r7, r8, 1
-    load_and_align_16 v6,  r7, r8, 1
-    load_and_align_16 v7,  r7, r8, 1
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb  v4, v4,  v5
-    vmrghb  v5, v6,  v7
-    vmrghb  v6, v8,  v9
-    vmrghb  v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0,  v29, v30, 1
-    hfilter_8 v1,  v29, v30, 1
-    hfilter_8 v2,  v29, v30, 1
-    hfilter_8 v3,  v29, v30, 1
-    hfilter_8 v4,  v29, v30, 1
-    hfilter_8 v5,  v29, v30, 1
-    hfilter_8 v6,  v29, v30, 1
-    hfilter_8 v7,  v29, v30, 1
-    hfilter_8 v8,  v29, v30, 1
-    hfilter_8 v9,  v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b   second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0,  r3, r4, 1
-    load_and_align_16 v1,  r3, r4, 1
-    load_and_align_16 v2,  r3, r4, 1
-    load_and_align_16 v3,  r3, r4, 1
-    load_and_align_16 v4,  r3, r4, 1
-    load_and_align_16 v5,  r3, r4, 1
-    load_and_align_16 v6,  r3, r4, 1
-    load_and_align_16 v7,  r3, r4, 1
-    load_and_align_16 v8,  r3, r4, 1
-    load_and_align_16 v9,  r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq     compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0,  v1
-    vmrghb  v1, v2,  v3
-    vmrghb  v2, v4,  v5
-    vmrghb  v3, v6,  v7
-    vmrghb  v4, v8,  v9
-    vmrghb  v5, v10, v11
-    vmrghb  v6, v12, v13
-    vmrghb  v7, v14, v15
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b   second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-
-    beq     compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-    load_and_align_16  v9,  r3, r4, 1
-    load_and_align_16  v10, r3, r4, 1
-    load_and_align_16  v11, r3, r4, 1
-    load_and_align_16  v12, r3, r4, 1
-    load_and_align_16  v13, r3, r4, 1
-    load_and_align_16  v14, r3, r4, 1
-    load_and_align_16  v15, r3, r4, 1
-    load_and_align_16  v16, r3, r4, 0
-
-    beq     compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0,  1
-    compute_sum_sse_16 v1,  1
-    compute_sum_sse_16 v2,  1
-    compute_sum_sse_16 v3,  1
-    compute_sum_sse_16 v4,  1
-    compute_sum_sse_16 v5,  1
-    compute_sum_sse_16 v6,  1
-    compute_sum_sse_16 v7,  1
-    compute_sum_sse_16 v8,  1
-    compute_sum_sse_16 v9,  1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
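-;# Bilinear filter tap pairs for the eight subpel positions; each pair sums
-;#  to 128, matching the divide-by-128 (shift by 7) applied after filtering.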
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/libvpx/vp8/common/reconinter.c b/libvpx/vp8/common/reconinter.c
index bac3c94..e302595 100644
--- a/libvpx/vp8/common/reconinter.c
+++ b/libvpx/vp8/common/reconinter.c
@@ -10,6 +10,8 @@
 
 
 #include <limits.h>
+#include <string.h>
+
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
@@ -30,31 +32,8 @@
 
     for (r = 0; r < 16; r++)
     {
-#if !(CONFIG_FAST_UNALIGNED)
-        dst[0] = src[0];
-        dst[1] = src[1];
-        dst[2] = src[2];
-        dst[3] = src[3];
-        dst[4] = src[4];
-        dst[5] = src[5];
-        dst[6] = src[6];
-        dst[7] = src[7];
-        dst[8] = src[8];
-        dst[9] = src[9];
-        dst[10] = src[10];
-        dst[11] = src[11];
-        dst[12] = src[12];
-        dst[13] = src[13];
-        dst[14] = src[14];
-        dst[15] = src[15];
+        memcpy(dst, src, 16);
 
-#else
-        ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
-        ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-        ((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
-        ((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
-
-#endif
         src += src_stride;
         dst += dst_stride;
 
@@ -72,19 +51,8 @@
 
     for (r = 0; r < 8; r++)
     {
-#if !(CONFIG_FAST_UNALIGNED)
-        dst[0] = src[0];
-        dst[1] = src[1];
-        dst[2] = src[2];
-        dst[3] = src[3];
-        dst[4] = src[4];
-        dst[5] = src[5];
-        dst[6] = src[6];
-        dst[7] = src[7];
-#else
-        ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
-        ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+        memcpy(dst, src, 8);
+
         src += src_stride;
         dst += dst_stride;
 
@@ -102,19 +70,8 @@
 
     for (r = 0; r < 4; r++)
     {
-#if !(CONFIG_FAST_UNALIGNED)
-        dst[0] = src[0];
-        dst[1] = src[1];
-        dst[2] = src[2];
-        dst[3] = src[3];
-        dst[4] = src[4];
-        dst[5] = src[5];
-        dst[6] = src[6];
-        dst[7] = src[7];
-#else
-        ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
-        ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+        memcpy(dst, src, 8);
+
         src += src_stride;
         dst += dst_stride;
 
diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c
index ec51ffe..0a6c51b 100644
--- a/libvpx/vp8/common/reconintra.c
+++ b/libvpx/vp8/common/reconintra.c
@@ -70,10 +70,10 @@
             expected_dc = 128;
         }
 
-        /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+        /*memset(ypred_ptr, expected_dc, 256);*/
         for (r = 0; r < 16; r++)
         {
-            vpx_memset(ypred_ptr, expected_dc, 16);
+            memset(ypred_ptr, expected_dc, 16);
             ypred_ptr += y_stride;
         }
     }
@@ -98,7 +98,7 @@
         for (r = 0; r < 16; r++)
         {
 
-            vpx_memset(ypred_ptr, yleft_col[r], 16);
+            memset(ypred_ptr, yleft_col[r], 16);
             ypred_ptr += y_stride;
         }
 
@@ -202,12 +202,12 @@
         }
 
 
-        /*vpx_memset(upred_ptr,expected_udc,64);*/
-        /*vpx_memset(vpred_ptr,expected_vdc,64);*/
+        /*memset(upred_ptr,expected_udc,64);*/
+        /*memset(vpred_ptr,expected_vdc,64);*/
         for (i = 0; i < 8; i++)
         {
-            vpx_memset(upred_ptr, expected_udc, 8);
-            vpx_memset(vpred_ptr, expected_vdc, 8);
+            memset(upred_ptr, expected_udc, 8);
+            memset(vpred_ptr, expected_vdc, 8);
             upred_ptr += pred_stride;
             vpred_ptr += pred_stride;
         }
@@ -217,8 +217,8 @@
     {
         for (i = 0; i < 8; i++)
         {
-            vpx_memcpy(upred_ptr, uabove_row, 8);
-            vpx_memcpy(vpred_ptr, vabove_row, 8);
+            memcpy(upred_ptr, uabove_row, 8);
+            memcpy(vpred_ptr, vabove_row, 8);
             upred_ptr += pred_stride;
             vpred_ptr += pred_stride;
         }
@@ -229,8 +229,8 @@
     {
         for (i = 0; i < 8; i++)
         {
-            vpx_memset(upred_ptr, uleft_col[i], 8);
-            vpx_memset(vpred_ptr, vleft_col[i], 8);
+            memset(upred_ptr, uleft_col[i], 8);
+            memset(vpred_ptr, vleft_col[i], 8);
             upred_ptr += pred_stride;
             vpred_ptr += pred_stride;
         }
diff --git a/libvpx/vp8/common/rtcd.c b/libvpx/vp8/common/rtcd.c
index 0b371b0..ab0e9b4 100644
--- a/libvpx/vp8/common/rtcd.c
+++ b/libvpx/vp8/common/rtcd.c
@@ -7,15 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #define RTCD_C
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-extern void vpx_scale_rtcd(void);
 
 void vp8_rtcd()
 {
-    vpx_scale_rtcd();
     once(setup_rtcd_internal);
 }
diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl
index 4aa0e0c..7924ae7 100644
--- a/libvpx/vp8/common/rtcd_defs.pl
+++ b/libvpx/vp8/common/rtcd_defs.pl
@@ -29,90 +29,90 @@
 # Dequant
 #
 add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx media neon/;
+specialize qw/vp8_dequantize_b mmx media neon msa/;
 $vp8_dequantize_b_media=vp8_dequantize_b_v6;
 
 add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
-specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
+specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/;
 $vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
 $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/;
 $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
-$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/;
 $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
-$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 #
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
 $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
-$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
 $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
 $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
-$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
 $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
 $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
 $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
 $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
-$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
 
 add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
 $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
 $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
 $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
 $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
 
 add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
 $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
 $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
 $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
-$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
 
 add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
 $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
 $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
 $vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
 $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
 
 #
 # IDCT
 #
 #idct16
 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
-specialize qw/vp8_short_idct4x4llm mmx media neon dspr2/;
+specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/;
 $vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
 $vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
 
@@ -124,13 +124,13 @@
 
 #iwalsh16
 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
-specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2/;
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/;
 $vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
 $vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
 
 #idct1_scalar_add
 add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2/;
+specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2 msa/;
 $vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
 $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
 
@@ -138,26 +138,25 @@
 # RECON
 #
 add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/;
+specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/;
 $vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
 $vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
 
 add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/;
+specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/;
 $vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
 $vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
 
 add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/;
+specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
 $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
 $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
 add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
-#TODO: fix assembly for neon
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon msa/;
 
 add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon msa/;
 
 add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
 specialize qw/vp8_intra4x4_predict media/;
@@ -168,18 +167,18 @@
 #
 if (vpx_config("CONFIG_POSTPROC") eq "yes") {
     add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-    specialize qw/vp8_mbpost_proc_down mmx sse2/;
+    specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
     $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
 
     add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-    specialize qw/vp8_mbpost_proc_across_ip sse2/;
+    specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
     $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
 
     add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
-    specialize qw/vp8_post_proc_down_and_across_mb_row sse2/;
+    specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
 
     add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
-    specialize qw/vp8_plane_add_noise mmx sse2/;
+    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
     $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
 
     add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
@@ -192,10 +191,10 @@
     # no asm yet
 
     add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
-    specialize qw/vp8_filter_by_weight16x16 sse2/;
+    specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
 
     add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
-    specialize qw/vp8_filter_by_weight8x8 sse2/;
+    specialize qw/vp8_filter_by_weight8x8 sse2 msa/;
 
     add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
     # no asm yet
@@ -205,318 +204,95 @@
 # Subpixel
 #
 add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2/;
+specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/;
 $vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
 $vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
 
 add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2/;
+specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/;
 $vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
 $vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
 
 add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2/;
+specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/;
 $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-# Disable neon while investigating https://code.google.com/p/webm/issues/detail?id=817
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2/;
+#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
 
 add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon/;
+specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/;
 $vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
 
 add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon/;
+specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/;
 $vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
 
 add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x4 mmx media neon/;
+specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
 $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx media/;
+#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
+specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
 
 #
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
-#
-# Sub-pixel Variance
-#
-add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
-$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon_asm/;
-$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
-$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
-$vp8_sub_pixel_variance8x8_neon_asm=vp8_sub_pixel_variance8x8_neon;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
-$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
-$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
-$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
-$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
-$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon_asm/;
-$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
-$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-$vp8_variance_halfpixvar16x16_h_neon_asm=vp8_variance_halfpixvar16x16_h_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon_asm/;
-$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
-$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-$vp8_variance_halfpixvar16x16_v_neon_asm=vp8_variance_halfpixvar16x16_v_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon_asm/;
-$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
-$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-$vp8_variance_halfpixvar16x16_hv_neon_asm=vp8_variance_halfpixvar16x16_hv_neon;
-
-#
-# Single block SAD
-#
-add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad4x4 mmx sse2 neon/;
-$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
-
-add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x8 mmx sse2 neon/;
-$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
-
-add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x16 mmx sse2 neon/;
-$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
-
-add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x8 mmx sse2 neon/;
-$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
-
-add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
-$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
-$vp8_sad16x16_media=vp8_sad16x16_armv6;
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x3 sse3/;
-
-add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x3 sse3/;
-
-add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x3 sse3/;
-
-add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x3 sse3 ssse3/;
-
-# Note the only difference in the following prototypes is that they return into
-# an array of short
-add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad4x4x8 sse4_1/;
-$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
-
-add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x8x8 sse4_1/;
-$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
-
-add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x16x8 sse4_1/;
-$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
-
-add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x8x8 sse4_1/;
-$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
-
-add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x16x8 sse4_1/;
-$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x4d sse3/;
-
-add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x4d sse3/;
-
-add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x4d sse3/;
-
-add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x4d sse3/;
-
-add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x4d sse3/;
-
-#
 # Encoder functions below this point.
 #
 if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
 
 #
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
-#
-# SSE (Sum Squared Error)
-#
-add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
-$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
-
-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon_asm/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-$vp8_mse16x16_neon_asm=vp8_mse16x16_neon;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon_asm/;
-$vp8_get4x4sse_cs_neon_asm=vp8_get4x4sse_cs_neon;
-
-#
 # Block copy
 #
 if ($opts{arch} =~ /x86/) {
-    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n";
+    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n";
     specialize qw/vp8_copy32xn sse2 sse3/;
 }
 
 #
-# Structured Similarity (SSIM)
-#
-if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    $opts{arch} eq "x86_64" and $sse2_on_x86_64 = "sse2";
-
-    add_proto qw/void vp8_ssim_parms_8x8/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp8_ssim_parms_8x8/, "$sse2_on_x86_64";
-
-    add_proto qw/void vp8_ssim_parms_16x16/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp8_ssim_parms_16x16/, "$sse2_on_x86_64";
-}
-
-#
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
-$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 
 #
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
-$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
-$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
-
-add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-# no asm yet
-
-add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon_asm/;
-$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
-
-add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
-specialize qw/vp8_quantize_mb neon/;
-
-add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
-specialize qw/vp8_quantize_mby neon/;
-
-add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
-specialize qw/vp8_quantize_mbuv neon/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
 
 #
 # Block subtraction
 #
 add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
-specialize qw/vp8_block_error mmx sse2/;
+specialize qw/vp8_block_error mmx sse2 msa/;
 $vp8_block_error_sse2=vp8_block_error_xmm;
 
 add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
-specialize qw/vp8_mbblock_error mmx sse2/;
+specialize qw/vp8_mbblock_error mmx sse2 msa/;
 $vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
 
 add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
-specialize qw/vp8_mbuverror mmx sse2/;
+specialize qw/vp8_mbuverror mmx sse2 msa/;
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 
-add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
-$vp8_subtract_b_media=vp8_subtract_b_armv6;
-$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
-
-add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
-$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
-$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
-
-add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
-$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
-$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
-
 #
 # Motion search
 #
@@ -537,25 +313,17 @@
 #
 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
-    specialize qw/vp8_temporal_filter_apply sse2/;
+    specialize qw/vp8_temporal_filter_apply sse2 msa/;
 }
 
 #
-# Pick Loopfilter
-#
-add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
-$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
-
-#
 # Denoiser filter
 #
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
     add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
-    specialize qw/vp8_denoiser_filter sse2 neon/;
+    specialize qw/vp8_denoiser_filter sse2 neon msa/;
     add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
-    specialize qw/vp8_denoiser_filter_uv sse2 neon/;
-
+    specialize qw/vp8_denoiser_filter_uv sse2 neon msa/;
 }
 
 # End of encoder only functions
diff --git a/libvpx/vp8/common/sad_c.c b/libvpx/vp8/common/sad_c.c
deleted file mode 100644
index 5f36fc9..0000000
--- a/libvpx/vp8/common/sad_c.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <limits.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
-                               const unsigned char *ref_ptr, int ref_stride,
-                               unsigned int max_sad, int m, int n)
-{
-    int r, c;
-    unsigned int sad = 0;
-
-    for (r = 0; r < n; r++)
-    {
-        for (c = 0; c < m; c++)
-        {
-            sad += abs(src_ptr[c] - ref_ptr[c]);
-        }
-
-        if (sad > max_sad)
-          break;
-
-        src_ptr += src_stride;
-        ref_ptr += ref_stride;
-    }
-
-    return sad;
-}
-
-/* max_sad is provided as an optional optimization point. Alternative
- * implementations of these functions are not required to check it.
- */
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
-                            const unsigned char *ref_ptr, int ref_stride,
-                            unsigned int max_sad)
-{
-    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
-}
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
-                          const unsigned char *ref_ptr, int ref_stride,
-                          unsigned int max_sad)
-{
-    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
-}
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
-                           const unsigned char *ref_ptr, int ref_stride,
-                           unsigned int max_sad)
-{
-    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
-
-}
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
-                           const unsigned char *ref_ptr, int ref_stride,
-                           unsigned int max_sad)
-{
-    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
-}
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
-                          const unsigned char *ref_ptr, int ref_stride,
-                          unsigned int max_sad)
-{
-    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
-}
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
-                      const unsigned char *ref_ptr, int ref_stride,
-                      unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
-                      const unsigned char *ref_ptr, int ref_stride,
-                      unsigned short *sad_array)
-{
-    sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-    sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
-    sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
-    sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
-    sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
-    sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char *ref_ptr, int ref_stride,
-                     unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char *ref_ptr, int ref_stride,
-                     unsigned short *sad_array)
-{
-    sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-    sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
-    sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
-    sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
-    sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
-    sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
-                    const unsigned char *ref_ptr, int ref_stride,
-                    unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
-                    const unsigned char *ref_ptr, int ref_stride,
-                    unsigned short *sad_array)
-{
-    sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-    sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
-    sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
-    sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
-    sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
-    sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char *ref_ptr, int ref_stride,
-                     unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char *ref_ptr, int ref_stride,
-                     unsigned short *sad_array)
-{
-    sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-    sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
-    sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
-    sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
-    sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
-    sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
-                    const unsigned char *ref_ptr, int ref_stride,
-                    unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
-                    const unsigned char *ref_ptr, int ref_stride,
-                    unsigned short *sad_array)
-{
-    sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
-    sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
-    sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-    sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
-    sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
-    sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
-    sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
-    sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
-                       const unsigned char * const ref_ptr[], int ref_stride,
-                       unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
-    sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
-                      const unsigned char * const ref_ptr[], int ref_stride,
-                      unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
-    sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char * const ref_ptr[], int ref_stride,
-                     unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
-    sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
-                      const unsigned char * const ref_ptr[], int ref_stride,
-                      unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
-    sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
-                     const unsigned char * const ref_ptr[], int  ref_stride,
-                     unsigned int *sad_array)
-{
-    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
-    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
-    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
-    sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-/* Copy 2 macroblocks to a buffer */
-void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
-                    unsigned char *dst_ptr, int dst_stride,
-                    int height)
-{
-    int r;
-
-    for (r = 0; r < height; r++)
-    {
-#if !(CONFIG_FAST_UNALIGNED)
-        dst_ptr[0] = src_ptr[0];
-        dst_ptr[1] = src_ptr[1];
-        dst_ptr[2] = src_ptr[2];
-        dst_ptr[3] = src_ptr[3];
-        dst_ptr[4] = src_ptr[4];
-        dst_ptr[5] = src_ptr[5];
-        dst_ptr[6] = src_ptr[6];
-        dst_ptr[7] = src_ptr[7];
-        dst_ptr[8] = src_ptr[8];
-        dst_ptr[9] = src_ptr[9];
-        dst_ptr[10] = src_ptr[10];
-        dst_ptr[11] = src_ptr[11];
-        dst_ptr[12] = src_ptr[12];
-        dst_ptr[13] = src_ptr[13];
-        dst_ptr[14] = src_ptr[14];
-        dst_ptr[15] = src_ptr[15];
-        dst_ptr[16] = src_ptr[16];
-        dst_ptr[17] = src_ptr[17];
-        dst_ptr[18] = src_ptr[18];
-        dst_ptr[19] = src_ptr[19];
-        dst_ptr[20] = src_ptr[20];
-        dst_ptr[21] = src_ptr[21];
-        dst_ptr[22] = src_ptr[22];
-        dst_ptr[23] = src_ptr[23];
-        dst_ptr[24] = src_ptr[24];
-        dst_ptr[25] = src_ptr[25];
-        dst_ptr[26] = src_ptr[26];
-        dst_ptr[27] = src_ptr[27];
-        dst_ptr[28] = src_ptr[28];
-        dst_ptr[29] = src_ptr[29];
-        dst_ptr[30] = src_ptr[30];
-        dst_ptr[31] = src_ptr[31];
-#else
-        ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
-        ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
-        ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
-        ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
-        ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
-        ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
-        ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
-        ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
-#endif
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-
-    }
-}
diff --git a/libvpx/vp8/common/setupintrarecon.c b/libvpx/vp8/common/setupintrarecon.c
index 60afe51..669564d 100644
--- a/libvpx/vp8/common/setupintrarecon.c
+++ b/libvpx/vp8/common/setupintrarecon.c
@@ -17,15 +17,15 @@
     int i;
 
     /* set up frame new frame for intra coded blocks */
-    vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+    memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
     for (i = 0; i < ybf->y_height; i++)
         ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
 
-    vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
     for (i = 0; i < ybf->uv_height; i++)
         ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
 
-    vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
     for (i = 0; i < ybf->uv_height; i++)
         ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
 
@@ -33,7 +33,7 @@
 
 void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
 {
-    vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-    vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-    vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+    memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
 }
diff --git a/libvpx/vp8/common/variance.h b/libvpx/vp8/common/variance.h
deleted file mode 100644
index 89a32a7..0000000
--- a/libvpx/vp8/common/variance.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_COMMON_VARIANCE_H_
-#define VP8_COMMON_VARIANCE_H_
-
-#include "vpx_config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vp8_sad_fn_t)(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int ref_stride,
-    unsigned int max_sad);
-
-typedef void (*vp8_copy32xn_fn_t)(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int ref_stride,
-    int n);
-
-typedef void (*vp8_sad_multi_fn_t)(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array);
-
-typedef void (*vp8_sad_multi1_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char *ref_ptr,
-     int  ref_stride,
-     unsigned short *sad_array
-    );
-
-typedef void (*vp8_sad_multi_d_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char * const ref_ptr[],
-     int  ref_stride,
-     unsigned int *sad_array
-    );
-
-typedef unsigned int (*vp8_variance_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char *ref_ptr,
-     int  ref_stride,
-     unsigned int *sse
-    );
-
-typedef unsigned int (*vp8_subpixvariance_fn_t)
-    (
-      const unsigned char  *src_ptr,
-      int  source_stride,
-      int  xoffset,
-      int  yoffset,
-      const unsigned char *ref_ptr,
-      int Refstride,
-      unsigned int *sse
-    );
-
-typedef void (*vp8_ssimpf_fn_t)
-      (
-        unsigned char *s,
-        int sp,
-        unsigned char *r,
-        int rp,
-        unsigned long *sum_s,
-        unsigned long *sum_r,
-        unsigned long *sum_sq_s,
-        unsigned long *sum_sq_r,
-        unsigned long *sum_sxr
-      );
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char *ref_ptr,
-     int  ref_stride
-    );
-
-typedef struct variance_vtable
-{
-    vp8_sad_fn_t            sdf;
-    vp8_variance_fn_t       vf;
-    vp8_subpixvariance_fn_t svf;
-    vp8_variance_fn_t       svf_halfpix_h;
-    vp8_variance_fn_t       svf_halfpix_v;
-    vp8_variance_fn_t       svf_halfpix_hv;
-    vp8_sad_multi_fn_t      sdx3f;
-    vp8_sad_multi1_fn_t     sdx8f;
-    vp8_sad_multi_d_fn_t    sdx4df;
-#if ARCH_X86 || ARCH_X86_64
-    vp8_copy32xn_fn_t       copymem;
-#endif
-} vp8_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP8_COMMON_VARIANCE_H_
diff --git a/libvpx/vp8/common/variance_c.c b/libvpx/vp8/common/variance_c.c
deleted file mode 100644
index 773b655..0000000
--- a/libvpx/vp8/common/variance_c.c
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "filter.h"
-
-
-unsigned int vp8_get_mb_ss_c
-(
-    const short *src_ptr
-)
-{
-    unsigned int i = 0, sum = 0;
-
-    do
-    {
-        sum += (src_ptr[i] * src_ptr[i]);
-        i++;
-    }
-    while (i < 256);
-
-    return sum;
-}
-
-
-static void variance(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    int  w,
-    int  h,
-    unsigned int *sse,
-    int *sum)
-{
-    int i, j;
-    int diff;
-
-    *sum = 0;
-    *sse = 0;
-
-    for (i = 0; i < h; i++)
-    {
-        for (j = 0; j < w; j++)
-        {
-            diff = src_ptr[j] - ref_ptr[j];
-            *sum += diff;
-            *sse += diff * diff;
-        }
-
-        src_ptr += source_stride;
-        ref_ptr += recon_stride;
-    }
-}
-
-
-unsigned int vp8_variance16x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance16x8_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-
-unsigned int vp8_variance8x8_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp8_mse16x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return var;
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass
-(
-    const unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass
-(
-    const unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i, j;
-    int  Temp;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply filter */
-            Temp = ((int)src_ptr[0]          * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-
-unsigned int vp8_sub_pixel_variance4x4_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-    unsigned short FData3[5*4]; /* Temp data bufffer used in filtering */
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    /* First filter 1d Horizontal */
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-    /* Now filter Verticaly */
-    var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
-
-    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[9*8]; /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
-    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[17*16];   /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
-    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_sub_pixel_mse16x16_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-    return *sse;
-}
-
-unsigned int vp8_sub_pixel_variance16x8_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[16*9];    /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
-    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance8x16_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[9*16];    /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
-    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
diff --git a/libvpx/vp8/common/loopfilter.c b/libvpx/vp8/common/vp8_loopfilter.c
similarity index 98%
rename from libvpx/vp8/common/loopfilter.c
rename to libvpx/vp8/common/vp8_loopfilter.c
index 7a07e76..8b55dff 100644
--- a/libvpx/vp8/common/loopfilter.c
+++ b/libvpx/vp8/common/vp8_loopfilter.c
@@ -82,11 +82,10 @@
         if (block_inside_limit < 1)
             block_inside_limit = 1;
 
-        vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
-        vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
-                SIMD_WIDTH);
-        vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
-                SIMD_WIDTH);
+        memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+        memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+               SIMD_WIDTH);
     }
 }
 
@@ -105,7 +104,7 @@
     /* init hev threshold const vectors */
     for(i = 0; i < 4 ; i++)
     {
-        vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+        memset(lfi->hev_thr[i], i, SIMD_WIDTH);
     }
 }
 
@@ -151,7 +150,7 @@
             /* we could get rid of this if we assume that deltas are set to
              * zero when not in use; encoder always uses deltas
              */
-            vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
             continue;
         }
 
@@ -261,6 +260,7 @@
     int mb_col;
     int filter_level;
     loop_filter_info_n *lfi_n = &cm->lf_info;
+    (void)post_uvstride;
 
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
     {
diff --git a/libvpx/vp8/common/x86/copy_sse2.asm b/libvpx/vp8/common/x86/copy_sse2.asm
new file mode 100644
index 0000000..86fae26
--- /dev/null
+++ b/libvpx/vp8/common/x86/copy_sse2.asm
@@ -0,0 +1,93 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,        [rsi+rax*2]
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm4
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]
+
+        sub             rcx,     4
+        cmp             rcx,     4
+        jge             .block_copy_sse2_loopx4
+
+        cmp             rcx, 0
+        je              .copy_is_done
+
+.block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             .block_copy_sse2_loop
+
+.copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libvpx/vp8/common/x86/copy_sse3.asm b/libvpx/vp8/common/x86/copy_sse3.asm
new file mode 100644
index 0000000..d789a40
--- /dev/null
+++ b/libvpx/vp8/common/x86/copy_sse3.asm
@@ -0,0 +1,146 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     max_sad       arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else
+  %if LIBVPX_YASM_WIN64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     max_sad     r8
+    %define     height      r8
+  %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     max_sad
+  %define     height
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %if LIBVPX_YASM_WIN64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]
+
+        sub             height,     4
+        cmp             height,     4
+        jge             .block_copy_sse3_loopx4
+
+        ; Check to see if there are more rows that need to be copied.
+        cmp             height, 0
+        je              .copy_is_done
+
+.block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1
+        jne             .block_copy_sse3_loop
+
+.copy_is_done:
+    STACK_FRAME_DESTROY_X3
diff --git a/libvpx/vp8/common/x86/idct_blk_mmx.c b/libvpx/vp8/common/x86/idct_blk_mmx.c
index a1e4ce6..f2532b3 100644
--- a/libvpx/vp8/common/x86/idct_blk_mmx.c
+++ b/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -36,7 +36,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
-            vpx_memset(q, 0, 2 * sizeof(q[0]));
+            memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -45,7 +45,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
                                       dst+4, stride);
-            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+            memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[2] > 1)
@@ -54,7 +54,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
                                       dst+8, stride);
-            vpx_memset(q + 32, 0, 2 * sizeof(q[0]));
+            memset(q + 32, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[3] > 1)
@@ -63,7 +63,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
                                       dst+12, stride);
-            vpx_memset(q + 48, 0, 2 * sizeof(q[0]));
+            memset(q + 48, 0, 2 * sizeof(q[0]));
         }
 
         q    += 64;
@@ -85,7 +85,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
-            vpx_memset(q, 0, 2 * sizeof(q[0]));
+            memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -94,7 +94,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
                                       dstu+4, stride);
-            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+            memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         q    += 32;
@@ -109,7 +109,7 @@
         else if (eobs[0] == 1)
         {
             vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
-            vpx_memset(q, 0, 2 * sizeof(q[0]));
+            memset(q, 0, 2 * sizeof(q[0]));
         }
 
         if (eobs[1] > 1)
@@ -118,7 +118,7 @@
         {
             vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
                                       dstv+4, stride);
-            vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+            memset(q + 16, 0, 2 * sizeof(q[0]));
         }
 
         q    += 32;
diff --git a/libvpx/vp8/common/x86/sad_mmx.asm b/libvpx/vp8/common/x86/sad_mmx.asm
deleted file mode 100644
index 592112f..0000000
--- a/libvpx/vp8/common/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp8_sad16x16_mmx) PRIVATE
-global sym(vp8_sad8x16_mmx) PRIVATE
-global sym(vp8_sad8x8_mmx) PRIVATE
-global sym(vp8_sad4x4_mmx) PRIVATE
-global sym(vp8_sad16x8_mmx) PRIVATE
-
-;unsigned int vp8_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp8_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp8_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp8_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp8_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp8_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/libvpx/vp8/common/x86/sad_sse2.asm b/libvpx/vp8/common/x86/sad_sse2.asm
deleted file mode 100644
index 8d86abc..0000000
--- a/libvpx/vp8/common/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp8_sad16x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp8_sad16x16_wmt) PRIVATE
-sym(vp8_sad16x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            xmm6,       xmm6
-
-.x16x16sad_wmt_loop:
-
-        movq            xmm0,       QWORD PTR [rsi]
-        movq            xmm2,       QWORD PTR [rsi+8]
-
-        movq            xmm1,       QWORD PTR [rdi]
-        movq            xmm3,       QWORD PTR [rdi+8]
-
-        movq            xmm4,       QWORD PTR [rsi+rax]
-        movq            xmm5,       QWORD PTR [rdi+rdx]
-
-
-        punpcklbw       xmm0,       xmm2
-        punpcklbw       xmm1,       xmm3
-
-        psadbw          xmm0,       xmm1
-        movq            xmm2,       QWORD PTR [rsi+rax+8]
-
-        movq            xmm3,       QWORD PTR [rdi+rdx+8]
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm2
-
-        punpcklbw       xmm5,       xmm3
-        psadbw          xmm4,       xmm5
-
-        paddw           xmm6,       xmm0
-        paddw           xmm6,       xmm4
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_wmt_loop
-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movq            rax,        xmm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp8_sad8x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_sad)
-global sym(vp8_sad8x16_wmt) PRIVATE
-sym(vp8_sad8x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-
-        lea             rcx,        [rcx+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x16sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        ja              .x8x16sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        QWORD PTR [rsi+rbx]
-        movq            mm3,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm2
-
-        cmp             rsi,        rcx
-        jne             .x8x16sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x8x16sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad8x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp8_sad8x8_wmt) PRIVATE
-sym(vp8_sad8x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        ja              .x8x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rbx]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x8x8sad_wmt_loop
-
-        movq            rax,        mm7
-.x8x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp8_sad4x4_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp8_sad4x4_wmt) PRIVATE
-sym(vp8_sad4x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        movd            mm4,        DWORD PTR [rsi]
-
-        movd            mm5,        DWORD PTR [rdi]
-        movd            mm6,        DWORD PTR [rsi+rax]
-
-        movd            mm7,        DWORD PTR [rdi+rdx]
-        punpcklbw       mm4,        mm6
-
-        punpcklbw       mm5,        mm7
-        psadbw          mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            rax,        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad16x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp8_sad16x8_wmt) PRIVATE
-sym(vp8_sad16x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x16x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        ja              .x16x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        QWORD PTR [rsi+rbx]
-        movq            mm5,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [rsi+rbx+8]
-        movq            mm3,        QWORD PTR [rdi+rdx+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x16x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_copy32xn_sse2(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp8_copy32xn_sse2) PRIVATE
-sym(vp8_copy32xn_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;dst_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;dst_stride
-        movsxd          rcx,        dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,        [rsi+rax*2]
-
-        movdqu          xmm4,       XMMWORD PTR [rsi]
-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,    [rsi+rax*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        movdqa          XMMWORD PTR [rdi + rdx], xmm2
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
-
-        lea             rdi,    [rdi+rdx*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm4
-        movdqa          XMMWORD PTR [rdi + 16], xmm5
-        movdqa          XMMWORD PTR [rdi + rdx], xmm6
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
-
-        lea             rdi,    [rdi+rdx*2]
-
-        sub             rcx,     4
-        cmp             rcx,     4
-        jge             .block_copy_sse2_loopx4
-
-        cmp             rcx, 0
-        je              .copy_is_done
-
-.block_copy_sse2_loop:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        lea             rsi,    [rsi+rax]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        lea             rdi,    [rdi+rdx]
-
-        sub             rcx,     1
-        jne             .block_copy_sse2_loop
-
-.copy_is_done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/libvpx/vp8/common/x86/sad_sse3.asm b/libvpx/vp8/common/x86/sad_sse3.asm
deleted file mode 100644
index 69c8d37..0000000
--- a/libvpx/vp8/common/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     max_sad       arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     max_sad     r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     max_sad
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     r0_ptr        rcx
-  %define     r1_ptr        rdx
-  %define     r2_ptr        rbx
-  %define     r3_ptr        rdi
-  %define     ref_stride    rbp
-  %define     result_ptr    arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    push        rbp
-    mov         rdi,        arg(2)              ; ref_ptr_base
-
-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
-    mov         rsi,        arg(0)              ; src_ptr
-
-    movsxd      rbx,        dword ptr arg(1)    ; src_stride
-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
-
-    xchg        rbx,        rax
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     r0_ptr      rsi
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      r8
-    %define     ref_stride  r9
-    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
-    push        rsi
-
-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     r0_ptr      r9
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      rdx
-    %define     ref_stride  rcx
-    %define     result_ptr  r8
-
-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
-  %define     src_ptr
-  %define     src_stride
-  %define     r0_ptr
-  %define     r1_ptr
-  %define     r2_ptr
-  %define     r3_ptr
-  %define     ref_stride
-  %define     result_ptr
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
-        mov             %2,         [%1+REG_SZ_BYTES*0]
-        mov             %3,         [%1+REG_SZ_BYTES*1]
-
-        mov             %4,         [%1+REG_SZ_BYTES*2]
-        mov             %5,         [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
-
-        psadbw          xmm4,       xmm0
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm4,        QWORD PTR [%3]
-        movq            mm5,        QWORD PTR [%4]
-        movq            mm6,        QWORD PTR [%5]
-        movq            mm7,        QWORD PTR [%6]
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-        psadbw          mm7,        mm0
-%else
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm1,        QWORD PTR [%3]
-        movq            mm2,        QWORD PTR [%4]
-        movq            mm3,        QWORD PTR [%5]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-%endif
-        movq            mm0,        QWORD PTR [%2+%7]
-        movq            mm1,        QWORD PTR [%3+%8]
-        movq            mm2,        QWORD PTR [%4+%8]
-        movq            mm3,        QWORD PTR [%5+%8]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6+%8]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-
-%endmacro
-
-;void vp8_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x16x3_sse3) PRIVATE
-sym(vp8_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp8_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x8x3_sse3) PRIVATE
-sym(vp8_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad8x16x3_sse3) PRIVATE
-sym(vp8_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad8x8x3_sse3) PRIVATE
-sym(vp8_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp8_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad4x4x3_sse3) PRIVATE
-sym(vp8_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;unsigned int vp8_sad16x16_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_sad)
-;%define lddqu movdqu
-global sym(vp8_sad16x16_sse3) PRIVATE
-sym(vp8_sad16x16_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        mov             end_ptr,    4
-        pxor            xmm7,        xmm7
-
-.vp8_sad16x16_sse3_loop:
-        movdqa          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
-        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          xmm4,       XMMWORD PTR [src_ptr]
-        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
-
-        psadbw          xmm0,       xmm1
-
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          xmm2,       xmm3
-        psadbw          xmm4,       xmm5
-        psadbw          xmm6,       xmm1
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        paddw           xmm7,        xmm0
-        paddw           xmm7,        xmm2
-        paddw           xmm7,        xmm4
-        paddw           xmm7,        xmm6
-
-        sub             end_ptr,     1
-        jne             .vp8_sad16x16_sse3_loop
-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-        paddw           xmm0,       xmm7
-        movq            rax,        xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp8_copy32xn_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp8_copy32xn_sse3) PRIVATE
-sym(vp8_copy32xn_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
-        lea             end_ptr,    [src_ptr+src_stride*2]
-
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
-        movdqu          xmm4,       XMMWORD PTR [end_ptr]
-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
-
-        lea             src_ptr,    [src_ptr+src_stride*4]
-
-        lea             end_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
-        movdqa          XMMWORD PTR [end_ptr], xmm4
-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
-        lea             ref_ptr,    [ref_ptr+ref_stride*4]
-
-        sub             height,     4
-        cmp             height,     4
-        jge             .block_copy_sse3_loopx4
-
-        ;Check to see if there are more rows that need to be copied.
-        cmp             height, 0
-        je              .copy_is_done
-
-.block_copy_sse3_loop:
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        lea             src_ptr,    [src_ptr+src_stride]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        lea             ref_ptr,    [ref_ptr+ref_stride]
-
-        sub             height,     1
-        jne             .block_copy_sse3_loop
-
-.copy_is_done:
-    STACK_FRAME_DESTROY_X3
-
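For reference, vp8_copy32xn_sse3 above is a straight block copy: height rows of 32 bytes, unrolled four rows at a time with a single-row remainder loop. A plain-C sketch (the helper name is illustrative, and it ignores the 16-byte destination alignment the movdqa stores assume):

    #include <string.h>

    /* Copy `height` rows of 32 bytes from src to dst. */
    static void copy32xn(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_stride, int height) {
        for (int r = 0; r < height; ++r)
            memcpy(dst + r * dst_stride, src + r * src_stride, 32);
    }
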
-;void vp8_sad16x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x16x4d_sse3) PRIVATE
-sym(vp8_sad16x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp8_sad16x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x8x4d_sse3) PRIVATE
-sym(vp8_sad16x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad8x16x4d_sse3) PRIVATE
-sym(vp8_sad8x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad8x8x4d_sse3) PRIVATE
-sym(vp8_sad8x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp8_sad4x4x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad4x4x4d_sse3) PRIVATE
-sym(vp8_sad4x4x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [r0_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [r1_ptr]
-        movd            mm5,        DWORD PTR [r2_ptr]
-
-        movd            mm6,        DWORD PTR [r3_ptr]
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-
-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        punpcklbw       mm6,        mm7
-        psadbw          mm4,        mm0
-
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-
-
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             r0_ptr,     [r0_ptr+ref_stride*2]
-
-        lea             r1_ptr,     [r1_ptr+ref_stride*2]
-        lea             r2_ptr,     [r2_ptr+ref_stride*2]
-
-        lea             r3_ptr,     [r3_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [r0_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm7
-
-        movd            mm3,        DWORD PTR [r1_ptr]
-        movd            mm7,        DWORD PTR [r2_ptr]
-
-        psadbw          mm2,        mm0
-%if ABI_IS_32BIT
-        mov             rax,        rbp
-
-        pop             rbp
-%define     ref_stride    rax
-%endif
-        mov             rsi,        result_ptr
-
-        paddw           mm1,        mm2
-        movd            [rsi],      mm1
-
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm1
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        movd            mm2,        DWORD PTR [r3_ptr]
-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        movd            [rsi+4],    mm3
-        punpcklbw       mm2,        mm1
-
-        movd            [rsi+8],    mm7
-        psadbw          mm2,        mm0
-
-        paddw           mm2,        mm6
-        movd            [rsi+12],   mm2
-
-
-    STACK_FRAME_DESTROY_X4
-
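For reference, the SSE3 file removed above computes sums of absolute differences (SAD): the x3 routines score the reference block at horizontal offsets 0, +1 and +2, and the x4d routines score four independent reference candidates. A plain-C sketch of the 16x16 x3 case, with illustrative helper names that are not part of libvpx's API:

    #include <stdlib.h>

    /* SAD of one 16x16 block. */
    static unsigned int sad16x16(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride) {
        unsigned int sad = 0;
        for (int r = 0; r < 16; ++r) {
            for (int c = 0; c < 16; ++c)
                sad += abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

    /* What vp8_sad16x16x3_sse3 stores to results[0..2]. */
    static void sad16x16x3(const unsigned char *src, int src_stride,
                           const unsigned char *ref, int ref_stride,
                           int *results) {
        for (int i = 0; i < 3; ++i)
            results[i] = (int)sad16x16(src, src_stride, ref + i, ref_stride);
    }
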
diff --git a/libvpx/vp8/common/x86/sad_sse4.asm b/libvpx/vp8/common/x86/sad_sse4.asm
deleted file mode 100644
index f7fccd7..0000000
--- a/libvpx/vp8/common/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-
-;void vp8_sad16x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(vp8_sad16x16x8_sse4) PRIVATE
-sym(vp8_sad16x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad16x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad16x8x8_sse4) PRIVATE
-sym(vp8_sad16x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad8x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad8x8x8_sse4) PRIVATE
-sym(vp8_sad8x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad8x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad8x16x8_sse4) PRIVATE
-sym(vp8_sad8x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_sad4x4x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp8_sad4x4x8_sse4) PRIVATE
-sym(vp8_sad4x4x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
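The SSE4.1 routines removed above use mpsadbw to score eight consecutive horizontal offsets per call, storing eight 16-bit sums to sad_array. A plain-C sketch of the 16x16 case (the helper name is illustrative; the unsigned short results match the declared sad_array type):

    #include <stdlib.h>

    /* What vp8_sad16x16x8_sse4 stores: sad_array[k] is the SAD against
       the reference shifted right by k bytes, for k = 0..7. */
    static void sad16x16x8(const unsigned char *src, int src_stride,
                           const unsigned char *ref, int ref_stride,
                           unsigned short *sad_array) {
        for (int k = 0; k < 8; ++k) {
            unsigned int sad = 0;
            for (int r = 0; r < 16; ++r)
                for (int c = 0; c < 16; ++c)
                    sad += abs(src[r * src_stride + c] -
                               ref[r * ref_stride + c + k]);
            sad_array[k] = (unsigned short)sad;
        }
    }
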
diff --git a/libvpx/vp8/common/x86/sad_ssse3.asm b/libvpx/vp8/common/x86/sad_ssse3.asm
deleted file mode 100644
index 278fc06..0000000
--- a/libvpx/vp8/common/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void vp8_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x16x3_ssse3) PRIVATE
-sym(vp8_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vp8_sad16x16x3_ssse3_skiptable
-.vp8_sad16x16x3_ssse3_jumptable:
-        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
-        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_skiptable:
-
-        call .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
-
-.vp8_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp8_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp8_sad16x8x3_ssse3) PRIVATE
-sym(vp8_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vp8_sad16x8x3_ssse3_skiptable
-.vp8_sad16x8x3_ssse3_jumptable:
-        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
-        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_skiptable:
-
-        call .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
-
-.vp8_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp8_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/libvpx/vp8/common/x86/variance_impl_sse2.asm b/libvpx/vp8/common/x86/variance_impl_sse2.asm
deleted file mode 100644
index 761433c..0000000
--- a/libvpx/vp8/common/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,1359 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-;unsigned int vp8_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp8_get_mb_ss_sse2) PRIVATE
-sym(vp8_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
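vp8_get_mb_ss_sse2 above sums the squares of the 256 short values of a macroblock (eight loop iterations over 64 bytes each). A plain-C sketch with an illustrative helper name:

    /* Sum of squares of 256 shorts, accumulated in 32 bits as in the
       assembly. */
    static unsigned int get_mb_ss(const short *src) {
        unsigned int ss = 0;
        for (int i = 0; i < 256; ++i)
            ss += (unsigned int)(src[i] * src[i]);
        return ss;
    }
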
-;unsigned int vp8_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp8_get16x16var_sse2) PRIVATE
-sym(vp8_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
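vp8_get16x16var_sse2 above (and the 8x8 variant that follows) produce the two partial results the variance code needs: the signed sum of pixel differences and the sum of squared differences. A plain-C sketch of the 16x16 case, with an illustrative helper name:

    /* Sum of pixel differences -> *sum; sum of squared differences -> *sse,
       in the same argument order as the comment block above. */
    static void get16x16var(const unsigned char *src, int src_stride,
                            const unsigned char *ref, int ref_stride,
                            unsigned int *sse, int *sum) {
        int s = 0;
        unsigned int ss = 0;
        for (int r = 0; r < 16; ++r) {
            for (int c = 0; c < 16; ++c) {
                int d = src[r * src_stride + c] - ref[r * ref_stride + c];
                s += d;
                ss += (unsigned int)(d * d);
            }
        }
        *sum = s;
        *sse = ss;
    }
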
-;unsigned int vp8_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp8_get8x8var_sse2) PRIVATE
-sym(vp8_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  DWORD PTR [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;
-;)
-global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp8_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-vp8_half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-vp8_half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-vp8_half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0,s1,s2..s7 of next row
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax]
-        pxor            xmm0,           xmm0
-
-vp8_half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             vp8_half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-vp8_half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance8x_h_1        ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-vp8_half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance16x_h_1        ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-vp8_bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
deleted file mode 100644
index 686b4a9..0000000
--- a/libvpx/vp8/common/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-
-;void vp8_filter_block2d_bil_var_ssse3
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
-global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp8_filter_block2d_bil_var_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6
-        pxor            xmm7,           xmm7
-
-        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              .filter_block2d_bil_var_ssse3_sp_only
-
-        shl             rax,            4                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              .filter_block2d_bil_var_ssse3_fp_only
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        movdqu          xmm0,           XMMWORD PTR [rsi]
-        movdqu          xmm1,           XMMWORD PTR [rsi+1]
-        movdqa          xmm2,           xmm0
-
-        punpcklbw       xmm0,           xmm1
-        punpckhbw       xmm2,           xmm1
-        pmaddubsw       xmm0,           [rax]
-        pmaddubsw       xmm2,           [rax]
-
-        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm0,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        packuswb        xmm0,           xmm2
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        lea             rsi,            [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-        packuswb        xmm1,           xmm3
-
-        movdqa          xmm2,           xmm0
-        movdqa          xmm0,           xmm1
-        movdqa          xmm3,           xmm2
-
-        punpcklbw       xmm2,           xmm1
-        punpckhbw       xmm3,           xmm1
-        pmaddubsw       xmm2,           [rdx]
-        pmaddubsw       xmm3,           [rdx]
-
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm2,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm1,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm1,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm2,           xmm1
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm2
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm2,           xmm2
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm2
-        paddd           xmm7,           xmm3
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rsi,            [rsi + r8]
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_var_ssse3_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              .filter_block2d_bil_var_ssse3_full_pixel
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqa          xmm0,           xmm1
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        lea             rsi,            [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-        movdqa          xmm2,           xmm1
-        movdqa          xmm0,           xmm3
-
-        punpcklbw       xmm1,           xmm3
-        punpckhbw       xmm2,           xmm3
-        pmaddubsw       xmm1,           [rdx]
-        pmaddubsw       xmm2,           [rdx]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        movq            xmm3,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm3,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        movdqa          xmm1,           xmm0
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_sp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0
-
-.filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]
-        punpcklbw       xmm1,           xmm0
-        movq            xmm2,           QWORD PTR [rsi+8]
-        punpcklbw       xmm2,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]
-        punpcklbw       xmm3,           xmm0
-        movq            xmm4,           QWORD PTR [rdi+8]
-        punpcklbw       xmm4,           xmm0
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm4
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_full_pixel_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm2,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm2,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm2
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm3
-
-        lea             rsi,            [rsi + rdx]
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_fp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
-        pxor        xmm0,           xmm0
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(7) ;[Sum]
-        mov         rdi,            arg(8) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-vp8_bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 112, 16
-    times 8 db 96,  32
-    times 8 db 80,  48
-    times 8 db 64,  64
-    times 8 db 48,  80
-    times 8 db 32,  96
-    times 8 db 16,  112
diff --git a/libvpx/vp8/common/x86/variance_mmx.c b/libvpx/vp8/common/x86/variance_mmx.c
deleted file mode 100644
index 02e0242..0000000
--- a/libvpx/vp8/common/x86/variance_mmx.c
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx
-(
-    const unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    short *filter
-);
-extern void filter_block1d_v6_mmx
-(
-    const short *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    short *filter
-);
-
-extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp8_get8x8var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-extern unsigned int vp8_get4x4var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-
-unsigned int vp8_variance4x4_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-unsigned int vp8_mse16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    *sse = var;
-    return var;
-}
-
-
-unsigned int vp8_variance16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance16x8_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_variance8x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_sub_pixel_variance4x4_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse)
-
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil4x4_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp8_sub_pixel_mse16x16_mmx(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-    return *sse;
-}
-
-unsigned int vp8_sub_pixel_variance16x8_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
-                                           ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
-                                           ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
-                                           ref_ptr, recon_stride, sse);
-}
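Each of the MMX wrappers deleted above computes the same quantity: the block SSE minus the squared pixel sum divided by the pixel count, with the >>4, >>6, >>7 and >>8 shifts corresponding to 4x4, 8x8, 16x8/8x16 and 16x16 blocks (the mse variants simply skip the subtraction). A plain-C reference of that formula, for illustration only:

    #include <stdint.h>

    /* Reference variance: SSE - sum^2/N over a w x h block. */
    static unsigned int block_variance_c(const unsigned char *src, int src_stride,
                                         const unsigned char *ref, int ref_stride,
                                         int w, int h, unsigned int *sse) {
        int r, c, sum = 0;
        unsigned int sq = 0;
        for (r = 0; r < h; ++r) {
            for (c = 0; c < w; ++c) {
                const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
                sum += d;
                sq += (unsigned int)(d * d);
            }
        }
        *sse = sq;
        return sq - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }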
diff --git a/libvpx/vp8/common/x86/variance_sse2.c b/libvpx/vp8/common/x86/variance_sse2.c
deleted file mode 100644
index 1fe127b..0000000
--- a/libvpx/vp8/common/x86/variance_sse2.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-extern unsigned int vp8_get4x4var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-
-unsigned int vp8_get_mb_ss_sse2
-(
-    const short *src_ptr
-);
-unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-unsigned int vp8_get8x8var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-void vp8_filter_block2d_bil_var_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int  xoffset,
-    int  yoffset,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_vert_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-unsigned int vp8_variance4x4_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-
-unsigned int vp8_variance16x16_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0;
-    int sum0;
-
-
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
-}
-unsigned int vp8_mse16x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-
-    unsigned int sse0;
-    int sum0;
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return sse0;
-
-}
-
-
-unsigned int vp8_variance16x8_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-unsigned int vp8_variance8x16_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-unsigned int vp8_sub_pixel_variance4x4_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil4x4_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum, &xxsum);
-    }
-
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    /* note we could avoid these if statements if the calling function
-     * just called the appropriate functions inside.
-     */
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum0, &xxsum0
-        );
-
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum1, &xxsum1
-        );
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_mse16x16_wmt(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-    return *sse;
-}
-
-unsigned int vp8_sub_pixel_variance16x8_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum1, &xxsum1);
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum, &xxsum);
-    }
-
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    vp8_half_horiz_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-    vp8_half_vert_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    vp8_half_horiz_vert_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
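The half-pel fast paths in the file deleted above (xoffset or yoffset equal to 4) exist because the bilinear taps degenerate to (64, 64), so the filter reduces to a rounded average of adjacent pixels. A scalar illustration of the horizontal case, not the SSE2 kernel itself:

    /* Half-pel horizontal interpolation: (a + b + 1) >> 1 equals
     * (64*a + 64*b + 64) >> 7, so no multiply is needed. */
    static void half_horiz_average(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int width, int height) {
        int r, c;
        for (r = 0; r < height; ++r)
            for (c = 0; c < width; ++c)
                dst[r * dst_stride + c] =
                    (unsigned char)((src[r * src_stride + c] +
                                     src[r * src_stride + c + 1] + 1) >> 1);
    }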
diff --git a/libvpx/vp8/common/x86/variance_ssse3.c b/libvpx/vp8/common/x86/variance_ssse3.c
deleted file mode 100644
index 73eb90d..0000000
--- a/libvpx/vp8/common/x86/variance_ssse3.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-
-extern unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-extern void vp8_half_horiz_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_half_horiz_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_half_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_ssse3
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int  xoffset,
-    int  yoffset,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance16x16_ssse3
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    /* note we could avoid these if statements if the calling function
-     * just called the appropriate functions inside.
-     */
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_ssse3(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_ssse3
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-
-)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_ssse3(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
diff --git a/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libvpx/vp8/common/x86/vp8_asm_stubs.c
index b409293..fb0b57e 100644
--- a/libvpx/vp8/common/x86/vp8_asm_stubs.c
+++ b/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -127,7 +127,7 @@
     int dst_pitch
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[16*16]);  /* Temp data bufffer used in filtering */
     const short *HFilter, *VFilter;
     HFilter = vp8_six_tap_mmx[xoffset];
     vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -148,7 +148,7 @@
 )
 {
 
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data bufffer used in filtering */
 
     const short *HFilter, *VFilter;
 
@@ -180,7 +180,7 @@
 )
 {
 
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);    /* Temp data bufffer used in filtering */
 
     const short *HFilter, *VFilter;
 
@@ -206,7 +206,7 @@
 )
 {
 
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);    /* Temp data bufffer used in filtering */
 
     const short *HFilter, *VFilter;
 
@@ -252,7 +252,7 @@
 
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);    /* Temp data bufffer used in filtering */
 
     const short *HFilter, *VFilter;
 
@@ -292,7 +292,7 @@
     int dst_pitch
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data bufffer used in filtering */
     const short *HFilter, *VFilter;
 
     if (xoffset)
@@ -330,7 +330,7 @@
     int dst_pitch
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data bufffer used in filtering */
     const short *HFilter, *VFilter;
 
     if (xoffset)
@@ -432,7 +432,7 @@
 
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
+    DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
 
     if (xoffset)
     {
@@ -480,7 +480,7 @@
     int dst_pitch
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
 
     if (xoffset)
     {
@@ -528,7 +528,7 @@
     int dst_pitch
 )
 {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
 
     if (xoffset)
     {
@@ -576,7 +576,7 @@
     int dst_pitch
 )
 {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
+  DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
 
   if (xoffset)
   {
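The vp8_asm_stubs.c hunks above swap DECLARE_ALIGNED_ARRAY for DECLARE_ALIGNED, which takes the complete array declarator rather than a separate element count. A minimal sketch of the usage, assuming the gcc/clang attribute form; a local macro name is used here so as not to imply the exact upstream definition in vpx_ports/mem.h:

    /* Approximate alignment macro for gcc/clang builds (illustration only). */
    #define ALIGNED_DECL(n, typ, val) typ val __attribute__((aligned(n)))

    static void filter_scratch_example(void) {
        ALIGNED_DECL(16, unsigned short, FData2[16 * 16]);  /* 16-byte aligned scratch */
        FData2[0] = 0;  /* temp buffer used the same way as in the stubs above */
    }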
diff --git a/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
similarity index 100%
rename from libvpx/vp8/common/x86/loopfilter_mmx.asm
rename to libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c
index e7cf0d9..56e167d 100644
--- a/libvpx/vp8/decoder/decodeframe.c
+++ b/libvpx/vp8/decoder/decodeframe.c
@@ -101,6 +101,8 @@
     int i;
 #if CONFIG_ERROR_CONCEALMENT
     int corruption_detected = 0;
+#else
+    (void)mb_idx;
 #endif
 
     if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -140,7 +142,7 @@
              * Better to use the predictor as reconstruction.
              */
             pbi->frame_corrupt_residual = 1;
-            vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+            memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
             vp8_conceal_corrupt_mb(xd);
 
 
@@ -149,7 +151,7 @@
             /* force idct to be skipped for B_PRED and use the
              * prediction only for reconstruction
              * */
-            vpx_memset(xd->eobs, 0, 25);
+            memset(xd->eobs, 0, 25);
         }
     }
 #endif
@@ -182,7 +184,7 @@
 
             /* clear out residual eob info */
             if(xd->mode_info_context->mbmi.mb_skip_coeff)
-                vpx_memset(xd->eobs, 0, 25);
+                memset(xd->eobs, 0, 25);
 
             intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
 
@@ -212,7 +214,7 @@
                             (b->qcoeff[0] * DQC[0],
                                 dst, dst_stride,
                                 dst, dst_stride);
-                        vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                        memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                     }
                 }
             }
@@ -249,14 +251,14 @@
 
                     vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                         xd->qcoeff);
-                    vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+                    memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
                 }
                 else
                 {
                     b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                     vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
                         xd->qcoeff);
-                    vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                    memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                 }
 
                 /* override the dc dequant constant in order to preserve the
@@ -321,7 +323,7 @@
 
     for (i = 0; i < (int)Border; i++)
     {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
         dest_ptr1 += plane_stride;
     }
 
@@ -336,7 +338,7 @@
 
     for (i = 0; i < (int)(Border); i++)
     {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
         dest_ptr1 += plane_stride;
     }
 
@@ -349,7 +351,7 @@
 
     for (i = 0; i < (int)(Border); i++)
     {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
         dest_ptr1 += plane_stride;
     }
 }
@@ -377,7 +379,7 @@
 
     for (i = 0; i < (int)Border; i++)
     {
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
         dest_ptr2 += plane_stride;
     }
 
@@ -395,7 +397,7 @@
 
     for (i = 0; i < (int)(Border); i++)
     {
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
         dest_ptr2 += plane_stride;
     }
 
@@ -409,7 +411,7 @@
 
     for (i = 0; i < (int)(Border); i++)
     {
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
         dest_ptr2 += plane_stride;
     }
 }
@@ -444,8 +446,8 @@
 
     for (i = 0; i < plane_height; i++)
     {
-        vpx_memset(dest_ptr1, src_ptr1[0], Border);
-        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
         src_ptr1  += plane_stride;
         src_ptr2  += plane_stride;
         dest_ptr1 += plane_stride;
@@ -468,8 +470,8 @@
 
     for (i = 0; i < plane_height; i++)
     {
-        vpx_memset(dest_ptr1, src_ptr1[0], Border);
-        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
         src_ptr1  += plane_stride;
         src_ptr2  += plane_stride;
         dest_ptr1 += plane_stride;
@@ -488,8 +490,8 @@
 
     for (i = 0; i < plane_height; i++)
     {
-        vpx_memset(dest_ptr1, src_ptr1[0], Border);
-        vpx_memset(dest_ptr2, src_ptr2[0], Border);
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
         src_ptr1  += plane_stride;
         src_ptr2  += plane_stride;
         dest_ptr1 += plane_stride;
@@ -566,7 +568,7 @@
 
         /* reset contexts */
         xd->above_context = pc->above_context;
-        vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+        memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
 
         xd->left_available = 0;
 
@@ -633,7 +635,7 @@
             xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
 
             if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
-              MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+              const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
               xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
               xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
               xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
@@ -916,19 +918,19 @@
     if (pc->frame_type == KEY_FRAME)
     {
         /* Various keyframe initializations */
-        vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+        memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
 
         vp8_init_mbmode_probs(pc);
 
         vp8_default_coef_probs(pc);
 
         /* reset the segment feature data to 0 with delta coding (Default state). */
-        vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+        memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
         xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
 
         /* reset the mode ref deltasa for loop filter */
-        vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-        vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+        memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+        memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
 
         /* All buffers are implicitly updated on key frames. */
         pc->refresh_golden_frame = 1;
@@ -1067,12 +1069,11 @@
                 pc->vert_scale = clear[6] >> 6;
             }
             data += 7;
-            clear += 7;
         }
         else
         {
-          vpx_memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
-          vpx_memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+          memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+          memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
         }
     }
     if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
@@ -1104,7 +1105,7 @@
         {
             xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
 
-            vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+            memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
 
             /* For each segmentation feature (Quant and loop filter level) */
             for (i = 0; i < MB_LVL_MAX; i++)
@@ -1128,7 +1129,7 @@
         if (xd->update_mb_segmentation_map)
         {
             /* Which macro block level features are enabled */
-            vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+            memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
 
             /* Read the probs used to decode the segment id for each macro block. */
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
@@ -1277,7 +1278,7 @@
 #endif
     if (pc->refresh_entropy_probs == 0)
     {
-        vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+        memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
     }
 
     pc->refresh_last_frame = pc->frame_type == KEY_FRAME  ||  vp8_read_bit(bc);
@@ -1326,7 +1327,7 @@
     }
 
     /* clear out the coeff buffer */
-    vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+    memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
 
     vp8_decode_mode_mvs(pbi);
 
@@ -1340,7 +1341,7 @@
     }
 #endif
 
-    vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+    memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
     pbi->frame_corrupt_residual = 0;
 
 #if CONFIG_MULTITHREAD
@@ -1379,7 +1380,7 @@
 
     if (pc->refresh_entropy_probs == 0)
     {
-        vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+        memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
         pbi->independent_partitions = prev_independent_partitions;
     }
 
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
index 35a22c7..1d155e7 100644
--- a/libvpx/vp8/decoder/decodemv.c
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -591,6 +591,8 @@
 static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
                                MB_MODE_INFO *mbmi)
 {
+    (void)mbmi;
+
     /* Read the Macroblock segmentation map if it is being updated explicitly
      * this frame (reset to 0 above by default)
      * By default on a key frame reset all MBs to segment 0
diff --git a/libvpx/vp8/decoder/detokenize.c b/libvpx/vp8/decoder/detokenize.c
index 452ff6c..fcc7533 100644
--- a/libvpx/vp8/decoder/detokenize.c
+++ b/libvpx/vp8/decoder/detokenize.c
@@ -20,8 +20,8 @@
     ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
     ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
 
-    vpx_memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
-    vpx_memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+    memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+    memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
 
     /* Clear entropy contexts for Y2 blocks */
     if (!x->mode_info_context->mbmi.is_4x4)
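The decoder changes above, and throughout this pull, replace the vpx_memset/vpx_memcpy wrappers with direct memset/memcpy calls. As far as the upstream history goes those wrappers were thin pass-throughs to libc, so the substitution is behavior-preserving; a hypothetical sketch of the old shape, for context only:

    #include <string.h>

    /* Hypothetical shape of the retired wrapper: forwarded straight to libc. */
    static void *vpx_memset_like(void *dest, int val, size_t length) {
        return memset(dest, val, length);
    }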
diff --git a/libvpx/vp8/decoder/error_concealment.c b/libvpx/vp8/decoder/error_concealment.c
index 4b304c8..bb6d443 100644
--- a/libvpx/vp8/decoder/error_concealment.c
+++ b/libvpx/vp8/decoder/error_concealment.c
@@ -350,7 +350,7 @@
                                  unsigned int first_corrupt)
 {
     int mb_row, mb_col;
-    vpx_memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+    memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
     /* First calculate the overlaps for all blocks */
     for (mb_row = 0; mb_row < mb_rows; ++mb_row)
     {
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
index 1d763b6..9015fcb 100644
--- a/libvpx/vp8/decoder/onyxd_if.c
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -58,7 +58,7 @@
     if (!pbi)
         return NULL;
 
-    vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+    memset(pbi, 0, sizeof(VP8D_COMP));
 
     if (setjmp(pbi->common.error.jmp))
     {
@@ -87,6 +87,7 @@
     pbi->ec_enabled = oxcf->error_concealment;
     pbi->overlaps = NULL;
 #else
+    (void)oxcf;
     pbi->ec_enabled = 0;
 #endif
     /* Error concealment is activated after a key frame has been
@@ -258,7 +259,7 @@
     return err;
 }
 
-int check_fragments_for_errors(VP8D_COMP *pbi)
+static int check_fragments_for_errors(VP8D_COMP *pbi)
 {
     if (!pbi->ec_active &&
         pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0)
@@ -303,6 +304,8 @@
 {
     VP8_COMMON *cm = &pbi->common;
     int retcode = -1;
+    (void)size;
+    (void)source;
 
     pbi->common.error.error_code = VPX_CODEC_OK;
 
@@ -407,6 +410,7 @@
 #if CONFIG_POSTPROC
     ret = vp8_post_proc_frame(&pbi->common, sd, flags);
 #else
+    (void)flags;
 
     if (pbi->common.frame_to_show)
     {
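Two small cleanups recur in onyxd_if.c above: parameters that are only consumed under a config option gain a (void) cast so unused-parameter warnings stay quiet in the other build, and check_fragments_for_errors becomes static since it has no external callers. A minimal illustration of both, using the CONFIG_POSTPROC guard from the hunk and an otherwise hypothetical function:

    /* static gives the helper internal linkage; the (void) cast keeps
     * -Wunused-parameter quiet when the argument is only used under an option. */
    static int postproc_ready_sketch(int flags) {
    #if CONFIG_POSTPROC
        return flags != 0;
    #else
        (void)flags;
        return 0;
    #endif
    }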
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
index fe290cf..6801532 100644
--- a/libvpx/vp8/decoder/threading.c
+++ b/libvpx/vp8/decoder/threading.c
@@ -60,12 +60,12 @@
 
         mbd->segmentation_enabled    = xd->segmentation_enabled;
         mbd->mb_segement_abs_delta     = xd->mb_segement_abs_delta;
-        vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+        memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
 
         /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
-        vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+        memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
         /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
-        vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+        memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
         /*unsigned char mode_ref_lf_delta_enabled;
         unsigned char mode_ref_lf_delta_update;*/
         mbd->mode_ref_lf_delta_enabled    = xd->mode_ref_lf_delta_enabled;
@@ -73,10 +73,10 @@
 
         mbd->current_bc = &pbi->mbc[0];
 
-        vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
-        vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
-        vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
-        vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+        memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
         mbd->fullpixel_mask = 0xffffffff;
 
@@ -96,6 +96,8 @@
     int i;
 #if CONFIG_ERROR_CONCEALMENT
     int corruption_detected = 0;
+#else
+    (void)mb_idx;
 #endif
 
     if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -135,7 +137,7 @@
              * Better to use the predictor as reconstruction.
              */
             pbi->frame_corrupt_residual = 1;
-            vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+            memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
             vp8_conceal_corrupt_mb(xd);
 
 
@@ -144,7 +146,7 @@
             /* force idct to be skipped for B_PRED and use the
              * prediction only for reconstruction
              * */
-            vpx_memset(xd->eobs, 0, 25);
+            memset(xd->eobs, 0, 25);
         }
     }
 #endif
@@ -177,7 +179,7 @@
 
             /* clear out residual eob info */
             if(xd->mode_info_context->mbmi.mb_skip_coeff)
-                vpx_memset(xd->eobs, 0, 25);
+                memset(xd->eobs, 0, 25);
 
             intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
 
@@ -227,7 +229,7 @@
                     {
                         vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
                                              dst, dst_stride, dst, dst_stride);
-                        vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                        memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                     }
                 }
             }
@@ -264,14 +266,14 @@
 
                     vp8_short_inv_walsh4x4(&b->dqcoeff[0],
                         xd->qcoeff);
-                    vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+                    memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
                 }
                 else
                 {
                     b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
                     vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
                         xd->qcoeff);
-                    vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                    memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
                 }
 
                 /* override the dc dequant constant in order to preserve the
@@ -358,7 +360,7 @@
 
        /* reset contexts */
        xd->above_context = pc->above_context;
-       vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+       memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
 
        xd->left_available = 0;
 
@@ -497,9 +499,9 @@
                if( mb_row != pc->mb_rows-1 )
                {
                    /* Save decoded MB last row data for next-row decoding */
-                   vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
-                   vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
-                   vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+                   memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+                   memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+                   memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
                }
 
                /* save left_col for next MB decoding */
@@ -874,23 +876,23 @@
     if (filter_level)
     {
         /* Set above_row buffer to 127 for decoding first MB row */
-        vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
-        vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
-        vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+        memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+        memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+        memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
 
         for (j=1; j<pc->mb_rows; j++)
         {
-            vpx_memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
-            vpx_memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
-            vpx_memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+            memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+            memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+            memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
         }
 
         /* Set left_col to 129 initially */
         for (j=0; j<pc->mb_rows; j++)
         {
-            vpx_memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
-            vpx_memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
-            vpx_memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+            memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+            memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+            memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
         }
 
         /* Initialize the loop filter for this frame. */
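For context on the constants in the threading.c hunk above: the per-row edge buffers are primed with 127 for the row above the frame and 129 for the column to its left, the neutral values VP8 intra prediction assumes outside the picture. A sketch mirroring those memsets; the buffer names and sizes here are illustrative:

    #include <string.h>

    /* Prime one macroblock row's edge buffers before decoding starts. */
    static void init_edge_buffers(unsigned char *y_above, unsigned char *y_left,
                                  int y_width) {
        memset(y_above, 127, (size_t)y_width + 5);  /* above-row predictors + guard */
        memset(y_left, 129, 16);                    /* left column of one luma MB */
    }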
diff --git a/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
deleted file mode 100644
index 4abe818..0000000
--- a/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,310 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_start_encode|
-    EXPORT |vp8_encode_bool|
-    EXPORT |vp8_stop_encode|
-    EXPORT |vp8_encode_value|
-    IMPORT |vp8_validate_buffer_arm|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-    ; macro for validating write buffer position
-    ; needs vp8_writer in r0
-    ; start shall not be in r1
-    MACRO
-    VALIDATE_POS $start, $pos
-    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
-    ldr  r2, [r0, #vp8_writer_buffer_end]
-    ldr  r3, [r0, #vp8_writer_error]
-    mov  r1, $pos
-    mov  r0, $start
-    bl   vp8_validate_buffer_arm
-    pop  {r0-r3, r12, lr}
-    MEND
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-; r2 unsigned char *source_end
-|vp8_start_encode| PROC
-    str     r2,  [r0, #vp8_writer_buffer_end]
-    mov     r12, #0
-    mov     r3,  #255
-    mvn     r2,  #23
-    str     r12, [r0, #vp8_writer_lowvalue]
-    str     r3,  [r0, #vp8_writer_range]
-    str     r2,  [r0, #vp8_writer_count]
-    str     r12, [r0, #vp8_writer_pos]
-    str     r1,  [r0, #vp8_writer_buffer]
-    bx      lr
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp8_encode_bool| PROC
-    push    {r4-r10, lr}
-
-    mov     r4, r2
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    sub     r7, r5, #1                  ; range-1
-
-    cmp     r1, #0
-    mul     r6, r4, r7                  ; ((range-1) * probability)
-
-    mov     r7, #1
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
-
-    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subne   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r9, r1                 ; validate_buffer at pos
-
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r10, pc}
-    ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
-    push    {r4-r10, lr}
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    mov     r10, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r9, r1                 ; validate_buffer at pos
-
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     stop_encode_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r10, pc}
-
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
-    push    {r4-r12, lr}
-
-    mov     r10, r2
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-    rsb     r4, r10, #32                 ; 32-n
-
-    ; v is kept in r1 during the token pack loop
-    lsl     r1, r1, r4                  ; r1 = v << 32 - n
-
-encode_value_loop
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r1, r1, #1                  ; bit = v >> n
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_ev      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_ev
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_ev
-token_zero_while_loop_ev
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_ev
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_ev
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_ev
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r9, r11                ; validate_buffer at pos
-
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     encode_value_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r12, pc}
-    ENDP
-
-    END
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
deleted file mode 100644
index 90a141c..0000000
--- a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,317 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_armv5|
-    IMPORT |vp8_validate_buffer_arm|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-
-    ; macro for validating write buffer position
-    ; needs vp8_writer in r0
-    ; start shall not be in r1
-    MACRO
-    VALIDATE_POS $start, $pos
-    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
-    ldr  r2, [r0, #vp8_writer_buffer_end]
-    ldr  r3, [r0, #vp8_writer_error]
-    mov  r1, $pos
-    mov  r0, $start
-    bl   vp8_validate_buffer_arm
-    pop  {r0-r3, r12, lr}
-    MEND
-
-
-; r0 vp8_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
-    push    {r4-r12, lr}
-    sub     sp, sp, #16
-
-    ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
-    ;  sizeof (TOKENEXTRA) is 8
-    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
-    str     r2, [sp, #0]
-    str     r3, [sp, #8]                ; save vp8_coef_encodings
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-    b       check_p_lt_stop
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #8]                ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but here it serves as a
-    ; temp variable.  So after r10 is used, reload
-    ; vp8_coef_tree into r10
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #56]               ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof (vp8_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #vp8_writer_pos]
-
-    VALIDATE_POS r7, r12               ; validate_buffer at pos
-
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    add     sp, sp, #16
-    pop     {r4-r12, pc}
-    ENDP
-
-    END
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
deleted file mode 100644
index 3a8d17a..0000000
--- a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,352 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-    IMPORT |vp8_validate_buffer_arm|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-
-    ; macro for validating write buffer position
-    ; needs vp8_writer in r0
-    ; start shall not be in r1
-    MACRO
-    VALIDATE_POS $start, $pos
-    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
-    ldr  r2, [r0, #vp8_writer_buffer_end]
-    ldr  r3, [r0, #vp8_writer_error]
-    mov  r1, $pos
-    mov  r0, $start
-    bl   vp8_validate_buffer_arm
-    pop  {r0-r3, r12, lr}
-    MEND
-
-; r0 VP8_COMP *cpi
-; r1 vp8_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
-    push    {r4-r12, lr}
-    sub     sp, sp, #24
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r2, [sp, #20]               ; save vp8_coef_encodings
-    str     r5, [sp, #12]               ; save mb_rows
-    str     r3, [sp, #8]                ; save vp8_extra_bits
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-
-    mov     r0, r1                      ; keep same as other loops
-
-    ldr     r2, [r0, #vp8_writer_lowvalue]
-    ldr     r5, [r0, #vp8_writer_range]
-    ldr     r3, [r0, #vp8_writer_count]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #20]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #64]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                 ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but here it serves as a
-    ; temp variable.  So after r10 is used, reload
-    ; vp8_coef_tree into r10
-    ldr     r10, [sp, #64]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #8]                ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof (vp8_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #vp8_writer_pos]
-
-    VALIDATE_POS r7, r12               ; validate_buffer at pos
-
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, #1
-    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
-    str     r6, [sp, #12]
-    bne     mb_row_loop
-
-    str     r2, [r0, #vp8_writer_lowvalue]
-    str     r5, [r0, #vp8_writer_range]
-    str     r3, [r0, #vp8_writer_count]
-    add     sp, sp, #24
-    pop     {r4-r12, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-
-    END
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
deleted file mode 100644
index e9aa495..0000000
--- a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-    IMPORT |vp8_validate_buffer_arm|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-    ; macro for validating write buffer position
-    ; needs vp8_writer in r0
-    ; start shall not be in r1
-    MACRO
-    VALIDATE_POS $start, $pos
-    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
-    ldr  r2, [r0, #vp8_writer_buffer_end]
-    ldr  r3, [r0, #vp8_writer_error]
-    mov  r1, $pos
-    mov  r0, $start
-    bl   vp8_validate_buffer_arm
-    pop  {r0-r3, r12, lr}
-    MEND
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 const unsigned char *cx_data_end
-; r3 int num_part
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
-    push    {r4-r12, lr}
-    sub     sp, sp, #40
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r5, [sp, #36]               ; save mb_rows
-    str     r1, [sp, #24]               ; save ptr = cx_data
-    str     r3, [sp, #20]               ; save num_part
-    str     r2, [sp, #8]                ; save cx_data_end
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-    str     r7, [sp, #32]               ; store start of cpi->tp_list
-
-    ldr     r11, _VP8_COMP_bc_          ; load up vp8_writer out of cpi
-    add     r0, r0, r11
-
-    mov     r11, #0
-    str     r11, [sp, #28]              ; i
-
-numparts_loop
-    ldr     r2, _vp8_writer_sz_         ; load up sizeof(vp8_writer)
-    add     r0, r2                      ; bc[i + 1]
-
-    ldr     r10, [sp, #24]              ; ptr
-    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
-    subs    r5, r5, r11                 ; move start point with each partition
-                                        ; mb_rows starts at i
-    str     r5,  [sp, #12]
-
-    ; Reset all of the VP8 Writer data for each partition that
-    ; is processed.
-    ; start_encode
-
-    ldr     r3, [sp, #8]
-    str     r3, [r0, #vp8_writer_buffer_end]
-
-    mov     r2, #0                      ; vp8_writer_lowvalue
-    mov     r5, #255                    ; vp8_writer_range
-    mvn     r3, #23                     ; vp8_writer_count
-
-    str     r2,  [r0, #vp8_writer_pos]
-    str     r10, [r0, #vp8_writer_buffer]
-
-    ble     end_partition               ; if (mb_rows <= 0) end partition
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #80]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp8_token_value]  ; v
-    ldr     r8, [r4, #vp8_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 is used earlier in the loop, but here it serves as a
-    ; temp variable.  So after r10 is used, reload
-    ; vp8_coef_tree into r10
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #84]                ; vp8_extra_bits
-    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof (vp8_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp8_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp8_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7                      ; count = -8
-    ldr     r7, [r0, #vp8_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #vp8_writer_pos]
-
-    VALIDATE_POS r7, r12                ; validate_buffer at pos
-
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r10, [sp, #20]              ; num_parts
-    mov     r1, #TOKENLIST_SZ
-    mul     r1, r10, r1
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, r10
-    add     r7, r7, r1                  ; next element in the array
-    str     r6, [sp, #12]
-    bgt     mb_row_loop
-
-end_partition
-    mov     r12, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp8_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp8_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp8_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp8_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp8_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-
-    VALIDATE_POS r10, r11               ; validate_buffer at pos
-
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r12, r12, #1
-    bne     stop_encode_loop
-
-    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
-    ldr     r12, [sp, #24]              ; ptr
-    add     r12, r12, r4                ; ptr += w->pos
-    str     r12, [sp, #24]
-
-    ldr     r11, [sp, #28]              ; i
-    ldr     r10, [sp, #20]              ; num_parts
-
-    add     r11, r11, #1                ; i++
-    str     r11, [sp, #28]
-
-    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
-    mov     r1, #TOKENLIST_SZ
-    add     r7, r7, r1                  ; next element in cpi->tp_list
-    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
-
-    cmp     r10, r11
-    bgt     numparts_loop
-
-    add     sp, sp, #40
-    pop     {r4-r12, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-_VP8_COMP_bc_
-    DCD     vp8_comp_bc
-_vp8_writer_sz_
-    DCD     vp8_writer_sz
-
-    END
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
deleted file mode 100644
index de35a1e..0000000
--- a/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_armv6|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *b
-; r1    BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
-    stmfd   sp!, {r1, r4-r11, lr}
-
-    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
-    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
-    ldr     r5, [r0, #vp8_block_round]      ; round
-    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
-    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
-    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
-
-    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
-                                    ; is used to update the counter so that
-                                    ; it can be used to mark nonzero
-                                    ; quantized coefficient pairs.
-
-    mov     r1, #0                  ; flags for quantized coeffs
-
-    ; PART 1: quantization and dequantization loop
-loop
-    ldr     r9, [r3], #4            ; [z1 | z0]
-    ldr     r10, [r5], #4           ; [r1 | r0]
-    ldr     r11, [r4], #4           ; [q1 | q0]
-
-    ssat16  lr, #1, r9              ; [sz1 | sz0]
-    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
-    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
-    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
-
-    ldr     r12, [r3], #4           ; [z3 | z2]
-
-    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
-    smultt  r9, r9, r11             ; [(x1+r1)*q1]
-
-    ldr     r10, [r5], #4           ; [r3 | r2]
-
-    ssat16  r11, #1, r12            ; [sz3 | sz2]
-    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
-    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
-    ldr     r9, [r4], #4            ; [q3 | q2]
-    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
-
-    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
-
-    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
-    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
-    smultt  r12, r12, r9            ; [(x3+r3)*q3]
-
-    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
-
-    cmp     r0, #0                  ; check if zero
-    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
-
-    str     r0, [r6], #4            ; *qcoeff++ = x
-    ldr     r9, [r8], #4            ; [dq1 | dq0]
-
-    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
-    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
-    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
-
-    cmp     r10, #0                 ; check if zero
-    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
-
-    str     r10, [r6], #4           ; *qcoeff++ = x
-    ldr     r11, [r8], #4           ; [dq3 | dq2]
-
-    smulbb  r12, r0, r9             ; [x0*dq0]
-    smultt  r0, r0, r9              ; [x1*dq1]
-
-    smulbb  r9, r10, r11            ; [x2*dq2]
-    smultt  r10, r10, r11           ; [x3*dq3]
-
-    lsls    r2, r2, #2              ; update loop counter
-    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
-    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
-    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
-    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
-    add     r7, r7, #8              ; dqcoeff += 8
-    bne     loop
-
-    ; PART 2: check position for eob...
-    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
-    mov     lr, #0                  ; init eob
-    cmp     r1, #0                  ; coeffs after quantization?
-    ldr     r12, [r11, #vp8_blockd_eob]
-    beq     end                     ; skip eob calculations if all zero
-
-    ldr     r0, [r11, #vp8_blockd_qcoeff]
-
-    ; check shortcut for nonzero qcoeffs
-    tst    r1, #0x80
-    bne    quant_coeff_15_14
-    tst    r1, #0x20
-    bne    quant_coeff_13_11
-    tst    r1, #0x8
-    bne    quant_coeff_12_7
-    tst    r1, #0x40
-    bne    quant_coeff_10_9
-    tst    r1, #0x10
-    bne    quant_coeff_8_3
-    tst    r1, #0x2
-    bne    quant_coeff_6_5
-    tst    r1, #0x4
-    bne    quant_coeff_4_2
-    b      quant_coeff_1_0
-
-quant_coeff_15_14
-    ldrh    r2, [r0, #30]       ; rc=15, i=15
-    mov     lr, #16
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #28]       ; rc=14, i=14
-    mov     lr, #15
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_13_11
-    ldrh    r2, [r0, #22]       ; rc=11, i=13
-    mov     lr, #14
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_12_7
-    ldrh    r3, [r0, #14]       ; rc=7,  i=12
-    mov     lr, #13
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #20]       ; rc=10, i=11
-    mov     lr, #12
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_10_9
-    ldrh    r3, [r0, #26]       ; rc=13, i=10
-    mov     lr, #11
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #24]       ; rc=12, i=9
-    mov     lr, #10
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_8_3
-    ldrh    r3, [r0, #18]       ; rc=9,  i=8
-    mov     lr, #9
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #12]       ; rc=6,  i=7
-    mov     lr, #8
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_6_5
-    ldrh    r3, [r0, #6]        ; rc=3,  i=6
-    mov     lr, #7
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #4]        ; rc=2,  i=5
-    mov     lr, #6
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_4_2
-    ldrh    r3, [r0, #10]       ; rc=5,  i=4
-    mov     lr, #5
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #16]       ; rc=8,  i=3
-    mov     lr, #4
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #8]        ; rc=4,  i=2
-    mov     lr, #3
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_1_0
-    ldrh    r2, [r0, #2]        ; rc=1,  i=1
-    mov     lr, #2
-    cmp     r2, #0
-    bne     end
-
-    mov     lr, #1              ; rc=0,  i=0
-
-end
-    strb    lr, [r12]
-    ldmfd   sp!, {r1, r4-r11, pc}
-
-    ENDP
-
-loop_count
-    DCD     0x1000000
-
-    END
-
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
deleted file mode 100644
index 000805d..0000000
--- a/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
-;      So, we can remove that part of the calculation.
-
-|vp8_mse16x16_armv6| PROC
-
-    push    {r4-r9, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     r4, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r5, [r0, #0x0]      ; load 4 src pixels
-    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r5, r6          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0x4]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-    ldr     r5, [r0, #0x8]      ; load 4 src pixels
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0xc]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    subs    r12, r12, #1        ; next row
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r1, [sp, #28]       ; get address of sse
-    mov     r0, r4              ; return sse
-    str     r4, [r1]            ; store sse
-
-    pop     {r4-r9, pc}
-
-    ENDP
-
-    END
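The hunk above is the tail of an ARMv6 sum-of-squared-errors kernel: for each group of four pixels, usub8/sel split the byte-wise differences into positive and negative halves, orr merges them into absolute differences, and uxtb16 plus smlad accumulate the squares into r4, which is finally stored through the sse pointer and returned. A minimal scalar sketch of one 16-pixel row of that accumulation (the function name and parameters below are illustrative, not taken from the file):

    /* Scalar sketch of one 16-pixel row of the SSE accumulation above.
     * Names are illustrative; the assembly walks src/ref by their strides
     * and keeps the running total in r4. */
    static unsigned int sse_row16_sketch(const unsigned char *src,
                                         const unsigned char *ref,
                                         unsigned int sse)
    {
        int i;
        for (i = 0; i < 16; i++) {
            int d = src[i] - ref[i];  /* usub8/sel/orr give |d| per byte */
            sse += d * d;             /* uxtb16 + smlad accumulate d * d */
        }
        return sse;
    }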
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
deleted file mode 100644
index 05746cf..0000000
--- a/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_subtract_mby_armv6|
-    EXPORT  |vp8_subtract_mbuv_armv6|
-    EXPORT  |vp8_subtract_b_armv6|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *be
-; r1    BLOCKD *bd
-; r2    int pitch
-|vp8_subtract_b_armv6| PROC
-
-    stmfd   sp!, {r4-r9}
-
-    ldr     r4, [r0, #vp8_block_base_src]
-    ldr     r5, [r0, #vp8_block_src]
-    ldr     r6, [r0, #vp8_block_src_diff]
-
-    ldr     r3, [r4]
-    ldr     r7, [r0, #vp8_block_src_stride]
-    add     r3, r3, r5          ; src = *base_src + src
-    ldr     r8, [r1, #vp8_blockd_predictor]
-
-    mov     r9, #4              ; loop count
-
-loop_block
-
-    ldr     r0, [r3], r7        ; src
-    ldr     r1, [r8], r2        ; pred
-
-    uxtb16  r4, r0              ; [s2 | s0]
-    uxtb16  r5, r1              ; [p2 | p0]
-    uxtb16  r0, r0, ror #8      ; [s3 | s1]
-    uxtb16  r1, r1, ror #8      ; [p3 | p1]
-
-    usub16  r4, r4, r5          ; [d2 | d0]
-    usub16  r5, r0, r1          ; [d3 | d1]
-
-    subs    r9, r9, #1          ; decrement loop counter
-
-    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
-    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
-
-    str     r0, [r6, #0]        ; diff
-    str     r1, [r6, #4]        ; diff
-
-    add     r6, r6, r2, lsl #1  ; update diff pointer
-    bne     loop_block
-
-    ldmfd   sp!, {r4-r9}
-    mov     pc, lr
-
-    ENDP
-
-
-; r0    short *diff
-; r1    unsigned char *usrc
-; r2    unsigned char *vsrc
-; r3    int src_stride
-; sp    unsigned char *upred
-; sp    unsigned char *vpred
-; sp    int pred_stride
-|vp8_subtract_mbuv_armv6| PROC
-
-    stmfd   sp!, {r4-r11}
-
-    add     r0, r0, #512        ; set *diff point to Cb
-    mov     r4, #8              ; loop count
-    ldr     r5, [sp, #32]       ; upred
-    ldr     r12, [sp, #40]      ; pred_stride
-
-    ; Subtract U block
-loop_u
-    ldr     r6, [r1]            ; usrc      (A)
-    ldr     r7, [r5]            ; upred     (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; usrc      (B)
-    ldr     r11, [r5, #4]       ; upred     (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r1, r1, r3          ; update usrc pointer
-    add     r5, r5, r12         ; update upred pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_u
-
-    ldr     r5, [sp, #36]       ; vpred
-    mov     r4, #8              ; loop count
-
-    ; Subtract V block
-loop_v
-    ldr     r6, [r2]            ; vsrc      (A)
-    ldr     r7, [r5]            ; vpred     (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r2, #4]       ; vsrc      (B)
-    ldr     r11, [r5, #4]       ; vpred     (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r2, r2, r3          ; update vsrc pointer
-    add     r5, r5, r12         ; update vpred pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_v
-
-    ldmfd   sp!, {r4-r11}
-    bx      lr
-
-    ENDP
-
-
-; r0    short *diff
-; r1    unsigned char *src
-; r2    int src_stride
-; r3    unsigned char *pred
-; sp    int pred_stride
-|vp8_subtract_mby_armv6| PROC
-
-    stmfd   sp!, {r4-r11}
-    ldr     r12, [sp, #32]      ; pred_stride
-    mov     r4, #16
-loop
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r3]            ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r3, #4]       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    ldr     r10, [r1, #8]       ; src       (C)
-    ldr     r11, [r3, #8]       ; pred      (C)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    uxtb16  r8, r10             ; [s2 | s0] (C)
-    str     r9, [r0], #4        ; diff      (B)
-
-    uxtb16  r9, r11             ; [p2 | p0] (C)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (C)
-    usub16  r7, r10, r11        ; [d3 | d1] (C)
-
-    ldr     r10, [r1, #12]      ; src       (D)
-    ldr     r11, [r3, #12]      ; pred      (D)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (C)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (C)
-
-    str     r8, [r0], #4        ; diff      (C)
-    uxtb16  r8, r10             ; [s2 | s0] (D)
-    str     r9, [r0], #4        ; diff      (C)
-
-    uxtb16  r9, r11             ; [p2 | p0] (D)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (D)
-    usub16  r7, r10, r11        ; [d3 | d1] (D)
-
-    add     r1, r1, r2          ; update src pointer
-    add     r3, r3, r12         ; update pred pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
-    str     r8, [r0], #4        ; diff      (D)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (D)
-
-    bne     loop
-
-    ldmfd   sp!, {r4-r11}
-    bx      lr
-
-    ENDP
-
-    END
-
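The block-subtract kernels above (and the NEON versions later in this patch) all compute the same thing: a per-pixel difference between source and prediction, widened to 16 bits. A minimal scalar sketch for one 4x4 block, using plain pointers instead of the BLOCK/BLOCKD fields read via vp8_asm_enc_offsets.asm (parameter names are illustrative):

    /* Scalar sketch of a 4x4 block subtract: diff = src - pred, widened to
     * short.  The armv6 code obtains these pointers and strides from the
     * BLOCK/BLOCKD structures. */
    static void subtract_b_sketch(short *diff, int diff_stride,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *pred, int pred_stride)
    {
        int r, c;
        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += diff_stride;
            src  += src_stride;
            pred += pred_stride;
        }
    }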
diff --git a/libvpx/vp8/encoder/arm/boolhuff_arm.c b/libvpx/vp8/encoder/arm/boolhuff_arm.c
deleted file mode 100644
index 17a941b..0000000
--- a/libvpx/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vpx/internal/vpx_codec_internal.h"
-
-const unsigned int vp8_prob_cost[256] =
-{
-    2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-    1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-    767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-    617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-    511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-    428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-    361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-    304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-    255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-    211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-    172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-    137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-    105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-    75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-    48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-    22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
-
-int vp8_validate_buffer_arm(const unsigned char *start,
-                            size_t               len,
-                            const unsigned char *end,
-                            struct vpx_internal_error_info *error)
-{
-    return validate_buffer(start, len, end, error);
-}
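The table above gives the cost, in 1/256th-of-a-bit units, of coding a symbol whose probability is p/256; the entries track -256 * log2(p / 256) to within a unit or two. A sketch that regenerates a close approximation (the original generator's exact rounding and clamping are not shown in this file, so treat this as an approximation, not a byte-exact reproduction):

    #include <math.h>

    /* Approximate regeneration of vp8_prob_cost: cost in 1/256 bit units of a
     * symbol with probability p out of 256.  The committed table differs from
     * this by at most a unit or two due to its original rounding/clamping. */
    static void approx_prob_cost(unsigned int cost[256])
    {
        int p;
        cost[0] = 2047;  /* p == 0 never occurs; pinned to the p == 1 entry */
        for (p = 1; p < 256; p++)
            cost[p] = (unsigned int)(-256.0 * log2(p / 256.0));
    }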
diff --git a/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
deleted file mode 100644
index 9374310..0000000
--- a/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_neon|
-    EXPORT  |vp8_fast_quantize_b_pair_neon|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
-    stmfd           sp!, {r4-r9}
-    vstmdb          sp!, {q4-q7}
-
-    ldr             r4, [r0, #vp8_block_coeff]
-    ldr             r5, [r0, #vp8_block_quant_fast]
-    ldr             r6, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z
-
-    ldr             r7, [r2, #vp8_blockd_qcoeff]
-
-    vabs.s16        q4, q0              ; calculate x = abs(z)
-    vabs.s16        q5, q1
-
-    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
-    vshr.s16        q2, q0, #15         ; sz
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
-
-    ldr             r4, [r1, #vp8_block_coeff]
-
-    vadd.s16        q4, q6              ; x + Round
-    vadd.s16        q5, q7
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z2
-
-    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q5, q9
-
-    vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
-    vabs.s16        q11, q1
-    vshr.s16        q12, q0, #15        ; sz2
-    vshr.s16        q13, q1, #15
-
-    ;modify data to have its original sign
-    veor.s16        q4, q2              ; y^sz
-    veor.s16        q5, q3
-
-    vadd.s16        q10, q6             ; x2 + Round
-    vadd.s16        q11, q7
-
-    ldr             r8, [r2, #vp8_blockd_dequant]
-
-    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q11, q9
-
-    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
-    vshr.s16        q5, #1
-
-    vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i]
-
-    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q5, q3
-
-    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q11, #1
-
-    ldr             r9, [r2, #vp8_blockd_dqcoeff]
-
-    veor.s16        q10, q12            ; y2^sz2
-    veor.s16        q11, q13
-
-    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
-
-
-    vsub.s16        q10, q12            ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q11, q13
-
-    ldr             r6, [r3, #vp8_blockd_qcoeff]
-
-    vmul.s16        q2, q6, q4          ; x * Dequant
-    vmul.s16        q3, q7, q5
-
-    adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
-
-    vmul.s16        q12, q6, q10        ; x2 * Dequant
-    vmul.s16        q13, q7, q11
-
-    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
-
-    vtst.16         q14, q4, q8         ; now find eob
-    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
-
-    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
-
-    ldr             r7, [r3, #vp8_blockd_dqcoeff]
-
-    vand            q0, q6, q14         ; get all valid numbers from scan array
-    vand            q1, q7, q15
-
-    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x * Dequant
-
-    vtst.16         q2, q10, q8         ; now find eob
-    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
-
-    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
-
-    vand            q10, q6, q2         ; get all valid numbers from scan array
-    vand            q11, q7, q3
-    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
-
-    vmax.u16        d0, d0, d1
-    vmax.u16        d20, d20, d21
-    vmovl.u16       q0, d0
-    vmovl.u16       q10, d20
-
-    vmax.u32        d0, d0, d1
-    vmax.u32        d20, d20, d21
-    vpmax.u32       d0, d0, d0
-    vpmax.u32       d20, d20, d20
-
-    ldr             r4, [r2, #vp8_blockd_eob]
-    ldr             r5, [r3, #vp8_blockd_eob]
-
-    vst1.8          {d0[0]}, [r4]       ; store eob
-    vst1.8          {d20[0]}, [r5]      ; store eob
-
-    vldmia          sp!, {q4-q7}
-    ldmfd           sp!, {r4-r9}
-    bx              lr
-
-    ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
-    stmfd           sp!, {r4-r7}
-
-    ldr             r3, [r0, #vp8_block_coeff]
-    ldr             r4, [r0, #vp8_block_quant_fast]
-    ldr             r5, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r3@128]  ; load z
-    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
-    ldr             r6, [r1, #vp8_blockd_qcoeff]
-    ldr             r7, [r1, #vp8_blockd_dqcoeff]
-    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
-
-    vabs.s16        q12, q0             ; calculate x = abs(z)
-    vabs.s16        q13, q1
-
-    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
-    vshr.s16        q2, q0, #15         ; sz
-    vmov            r2, r3, d28         ; check if all zero (step 3)
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
-
-    vadd.s16        q12, q14            ; x + Round
-    vadd.s16        q13, q15
-
-    adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table
-
-    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q13, q9
-
-    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    ldr             r4, [r1, #vp8_blockd_dequant]
-
-    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q13, #1
-
-    ldr             r5, [r1, #vp8_blockd_eob]
-
-    orr             r2, r2, r3          ; check if all zero (step 4)
-    cmp             r2, #0              ; check if all zero (step 5)
-    beq             zero_output         ; check if all zero (step 6)
-
-    ;modify data to have its original sign
-    veor.s16        q12, q2             ; y^sz
-    veor.s16        q13, q3
-
-    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q13, q3
-
-    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
-
-    vtst.16         q14, q12, q8        ; now find eob
-    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
-
-    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
-
-    vand            q10, q10, q14       ; get all valid numbers from scan array
-    vand            q11, q11, q15
-
-
-    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
-    vmax.u16        d0, d0, d1
-    vmovl.u16       q0, d0
-
-    vmul.s16        q2, q12             ; x * Dequant
-    vmul.s16        q3, q13
-
-    vmax.u32        d0, d0, d1
-    vpmax.u32       d0, d0, d0
-
-    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
-
-    vst1.8          {d0[0]}, [r5]       ; store eob
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-zero_output
-    strb            r2, [r5]            ; store eob
-    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
-    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
-    ALIGN 16    ; enable use of @128 bit aligned loads
-inv_zig_zag
-    DCW 0x0001, 0x0002, 0x0006, 0x0007
-    DCW 0x0003, 0x0005, 0x0008, 0x000d
-    DCW 0x0004, 0x0009, 0x000c, 0x000e
-    DCW 0x000a, 0x000b, 0x000f, 0x0010
-
-    END
-
diff --git a/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 0000000..e5824bf
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+static const uint16_t inv_zig_zag[16] = {
+    1,  2,  6,   7,
+    3,  5,  8,  13,
+    4,  9,  12, 14,
+    10, 11, 15, 16
+};
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+    const int16x8_t one_q = vdupq_n_s16(-1),
+                    z0 = vld1q_s16(b->coeff),
+                    z1 = vld1q_s16(b->coeff + 8),
+                    round0 = vld1q_s16(b->round),
+                    round1 = vld1q_s16(b->round + 8),
+                    quant0 = vld1q_s16(b->quant_fast),
+                    quant1 = vld1q_s16(b->quant_fast + 8),
+                    dequant0 = vld1q_s16(d->dequant),
+                    dequant1 = vld1q_s16(d->dequant + 8);
+    const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+                     zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+    int16x8_t x0, x1, sz0, sz1, y0, y1;
+    uint16x8_t eob0, eob1;
+    uint16x4_t eob_d16;
+    uint32x2_t eob_d32;
+    uint32x4_t eob_q32;
+
+    /* sign of z: z >> 15 */
+    sz0 = vshrq_n_s16(z0, 15);
+    sz1 = vshrq_n_s16(z1, 15);
+
+    /* x = abs(z) */
+    x0 = vabsq_s16(z0);
+    x1 = vabsq_s16(z1);
+
+    /* x += round */
+    x0 = vaddq_s16(x0, round0);
+    x1 = vaddq_s16(x1, round1);
+
+    /* y = 2 * (x * quant) >> 16 */
+    y0 = vqdmulhq_s16(x0, quant0);
+    y1 = vqdmulhq_s16(x1, quant1);
+
+    /* Compensate for doubling in vqdmulhq */
+    y0 = vshrq_n_s16(y0, 1);
+    y1 = vshrq_n_s16(y1, 1);
+
+    /* Restore sign bit */
+    y0 = veorq_s16(y0, sz0);
+    y1 = veorq_s16(y1, sz1);
+    x0 = vsubq_s16(y0, sz0);
+    x1 = vsubq_s16(y1, sz1);
+
+    /* find non-zero elements */
+    eob0 = vtstq_s16(x0, one_q);
+    eob1 = vtstq_s16(x1, one_q);
+
+    /* mask zig zag */
+    eob0 = vandq_u16(eob0, zig_zag0);
+    eob1 = vandq_u16(eob1, zig_zag1);
+
+    /* select the largest value */
+    eob0 = vmaxq_u16(eob0, eob1);
+    eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+    eob_q32 = vmovl_u16(eob_d16);
+    eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+    eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+    /* qcoeff = x */
+    vst1q_s16(d->qcoeff, x0);
+    vst1q_s16(d->qcoeff + 8, x1);
+
+    /* dqcoeff = x * dequant */
+    vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+    vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+
+    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
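The intrinsics above follow the usual VP8 fast-quantize recipe; a scalar sketch of the same steps, written against the same BLOCK/BLOCKD fields and the inv_zig_zag table defined earlier in this file (the vqdmulhq/vshrq pair corresponds to the plain (x * quant) >> 16 below; the function name is illustrative):

    /* Scalar sketch of the fast quantizer above, one 4x4 block.  Assumes the
     * same includes and the inv_zig_zag table from this file. */
    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++) {
            int z  = b->coeff[i];
            int sz = z >> 31;              /* 0 if positive, -1 if negative */
            int x  = (z ^ sz) - sz;        /* abs(z)                        */
            int y  = ((x + b->round[i]) * b->quant_fast[i]) >> 16;
            x = (y ^ sz) - sz;             /* restore the sign              */
            d->qcoeff[i]  = (short)x;
            d->dqcoeff[i] = (short)(x * d->dequant[i]);
            if (x != 0 && inv_zig_zag[i] > eob)
                eob = inv_zig_zag[i];      /* 1-based position in scan order */
        }
        *d->eob = (char)eob;
    }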
diff --git a/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
deleted file mode 100644
index ec8071e..0000000
--- a/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/loopfilter.h"
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
-                                    unsigned char *src_ptr,
-                                    int sz);
-
-
-void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                      YV12_BUFFER_CONFIG *dst_ybc)
-{
-    unsigned char *src_y, *dst_y;
-    int yheight;
-    int ystride;
-    int yoffset;
-    int linestocopy;
-
-    yheight  = src_ybc->y_height;
-    ystride  = src_ybc->y_stride;
-
-    /* number of MB rows to use in partial filtering */
-    linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
-    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
-
-    /* Copy extra 4 so that full filter context is available if filtering done
-     * on the copied partial frame and not original. Partial filter does mb
-     * filtering for top row also, which can modify 3 pixels above.
-     */
-    linestocopy += 4;
-    /* partial image starts at ~middle of frame (macroblock border) */
-    yoffset  = ystride * (((yheight >> 5) * 16) - 4);
-    src_y = src_ybc->y_buffer + yoffset;
-    dst_y = dst_ybc->y_buffer + yoffset;
-
-    vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
-}
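The deleted wrapper above is just pointer arithmetic around the band of rows it hands to vp8_memcpy_partial_neon. A worked sketch of that arithmetic, assuming purely for illustration a 720-line luma plane and a PARTIAL_FRAME_FRACTION of 8 (the macro itself is pulled in via the loopfilter include above):

    /* Worked example of the partial-frame geometry above for yheight == 720,
     * with an assumed fraction of 8 (illustrative value only). */
    static int partial_copy_rows(int yheight, int fraction)
    {
        int linestocopy = (yheight >> 4) / fraction;       /* MB rows in band  */
        linestocopy = linestocopy ? linestocopy << 4 : 16;  /* back to pixels   */
        return linestocopy + 4;                             /* +4 context rows  */
    }
    /* partial_copy_rows(720, 8) == (45 / 8) * 16 + 4 == 84 rows, starting at
     * yoffset == ystride * (((720 >> 5) * 16) - 4) == ystride * 348. */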
diff --git a/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd8..0000000
--- a/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 bit aligned loads
-coeff
-    DCW      5352,  5352,  5352, 5352
-    DCW      2217,  2217,  2217, 2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500, 7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn            d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn            q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]! ; block A
-    vst1.16         {q2, q3}, [r1@128]! ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000..391e5f9
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}
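Both intrinsics functions above use the same multipliers (5352, 2217) and rounding terms (14500/7500 in pass one, 12000/51000 in pass two) that appear in the per-element comments of the deleted assembly. A scalar sketch of one 4x4 forward DCT built from those comments (the 8x4 entry point simply runs two such blocks side by side; the function name is illustrative):

    #include <stdint.h>

    /* Scalar sketch of the 4x4 forward DCT implemented above.  Constants and
     * rounding terms follow the comments in the NEON code; pitch is in bytes,
     * as in the intrinsics version, hence the >> 1 on a short pointer. */
    static void short_fdct4x4_sketch(const int16_t *input, int16_t *output,
                                     int pitch)
    {
        int i, a1, b1, c1, d1;
        int16_t tmp[16];

        for (i = 0; i < 4; i++) {          /* pass one: rows */
            const int16_t *ip = input + i * (pitch >> 1);
            a1 = (ip[0] + ip[3]) << 3;
            b1 = (ip[1] + ip[2]) << 3;
            c1 = (ip[1] - ip[2]) << 3;
            d1 = (ip[0] - ip[3]) << 3;
            tmp[i * 4 + 0] = (int16_t)(a1 + b1);
            tmp[i * 4 + 2] = (int16_t)(a1 - b1);
            tmp[i * 4 + 1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            tmp[i * 4 + 3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
        }
        for (i = 0; i < 4; i++) {          /* pass two: columns */
            a1 = tmp[i + 0] + tmp[i + 12];
            b1 = tmp[i + 4] + tmp[i + 8];
            c1 = tmp[i + 4] - tmp[i + 8];
            d1 = tmp[i + 0] - tmp[i + 12];
            output[i + 0]  = (int16_t)((a1 + b1 + 7) >> 4);
            output[i + 8]  = (int16_t)((a1 - b1 + 7) >> 4);
            output[i + 4]  = (int16_t)(((c1 * 2217 + d1 * 5352 + 12000) >> 16)
                                       + (d1 != 0));
            output[i + 12] = (int16_t)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
        }
    }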
diff --git a/libvpx/vp8/encoder/arm/neon/subtract_neon.asm b/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
deleted file mode 100644
index 840cb33..0000000
--- a/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_subtract_b_neon|
-    EXPORT |vp8_subtract_mby_neon|
-    EXPORT |vp8_subtract_mbuv_neon|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_base_src]
-    ldr     r4, [r0, #vp8_block_src]
-    ldr     r5, [r0, #vp8_block_src_diff]
-    ldr     r3, [r3]
-    ldr     r6, [r0, #vp8_block_src_stride]
-    add     r3, r3, r4                      ; src = *base_src + src
-    ldr     r7, [r1, #vp8_blockd_predictor]
-
-    vld1.8          {d0}, [r3], r6          ;load src
-    vld1.8          {d1}, [r7], r2          ;load pred
-    vld1.8          {d2}, [r3], r6
-    vld1.8          {d3}, [r7], r2
-    vld1.8          {d4}, [r3], r6
-    vld1.8          {d5}, [r7], r2
-    vld1.8          {d6}, [r3], r6
-    vld1.8          {d7}, [r7], r2
-
-    vsubl.u8        q10, d0, d1
-    vsubl.u8        q11, d2, d3
-    vsubl.u8        q12, d4, d5
-    vsubl.u8        q13, d6, d7
-
-    mov             r2, r2, lsl #1
-
-    vst1.16         {d20}, [r5], r2         ;store diff
-    vst1.16         {d22}, [r5], r2
-    vst1.16         {d24}, [r5], r2
-    vst1.16         {d26}, [r5], r2
-
-    ldmfd   sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
-;                           unsigned char *pred, int pred_stride)
-|vp8_subtract_mby_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    mov             r12, #4
-    ldr             r4, [sp, #80]           ; pred_stride
-    mov             r6, #32                 ; "diff" stride x2
-    add             r5, r0, #16             ; second diff pointer
-
-subtract_mby_loop
-    vld1.8          {q0}, [r1], r2          ;load src
-    vld1.8          {q1}, [r3], r4          ;load pred
-    vld1.8          {q2}, [r1], r2
-    vld1.8          {q3}, [r3], r4
-    vld1.8          {q4}, [r1], r2
-    vld1.8          {q5}, [r3], r4
-    vld1.8          {q6}, [r1], r2
-    vld1.8          {q7}, [r3], r4
-
-    vsubl.u8        q8, d0, d2
-    vsubl.u8        q9, d1, d3
-    vsubl.u8        q10, d4, d6
-    vsubl.u8        q11, d5, d7
-    vsubl.u8        q12, d8, d10
-    vsubl.u8        q13, d9, d11
-    vsubl.u8        q14, d12, d14
-    vsubl.u8        q15, d13, d15
-
-    vst1.16         {q8}, [r0], r6          ;store diff
-    vst1.16         {q9}, [r5], r6
-    vst1.16         {q10}, [r0], r6
-    vst1.16         {q11}, [r5], r6
-    vst1.16         {q12}, [r0], r6
-    vst1.16         {q13}, [r5], r6
-    vst1.16         {q14}, [r0], r6
-    vst1.16         {q15}, [r5], r6
-
-    subs            r12, r12, #1
-    bne             subtract_mby_loop
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-    ENDP
-
-;=================================
-;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-
-|vp8_subtract_mbuv_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    ldr             r4, [sp, #80]       ; upred
-    ldr             r5, [sp, #84]       ; vpred
-    ldr             r6, [sp, #88]       ; pred_stride
-    add             r0, r0, #512        ; short *udiff = diff + 256;
-    mov             r12, #32            ; "diff" stride x2
-    add             r7, r0, #16         ; second diff pointer
-
-;u
-    vld1.8          {d0}, [r1], r3      ;load usrc
-    vld1.8          {d1}, [r4], r6      ;load upred
-    vld1.8          {d2}, [r1], r3
-    vld1.8          {d3}, [r4], r6
-    vld1.8          {d4}, [r1], r3
-    vld1.8          {d5}, [r4], r6
-    vld1.8          {d6}, [r1], r3
-    vld1.8          {d7}, [r4], r6
-    vld1.8          {d8}, [r1], r3
-    vld1.8          {d9}, [r4], r6
-    vld1.8          {d10}, [r1], r3
-    vld1.8          {d11}, [r4], r6
-    vld1.8          {d12}, [r1], r3
-    vld1.8          {d13}, [r4], r6
-    vld1.8          {d14}, [r1], r3
-    vld1.8          {d15}, [r4], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-;v
-    vld1.8          {d0}, [r2], r3      ;load vsrc
-    vld1.8          {d1}, [r5], r6      ;load vpred
-    vld1.8          {d2}, [r2], r3
-    vld1.8          {d3}, [r5], r6
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d5}, [r5], r6
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d7}, [r5], r6
-    vld1.8          {d8}, [r2], r3
-    vld1.8          {d9}, [r5], r6
-    vld1.8          {d10}, [r2], r3
-    vld1.8          {d11}, [r5], r6
-    vld1.8          {d12}, [r2], r3
-    vld1.8          {d13}, [r5], r6
-    vld1.8          {d14}, [r2], r3
-    vld1.8          {d15}, [r5], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-
-    ENDP
-
-    END
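Both macroblock subtract routines above (like the ARMv6 versions deleted earlier) write into a single contiguous difference buffer: 256 luma shorts followed by 64 U and 64 V shorts, which is why the chroma path starts with a #512 byte (256 short) offset. A small sketch of the layout the stores assume (the enum names are illustrative, not from the sources):

    /* Layout of the macroblock difference buffer written above.  Offsets are
     * in int16_t units; the assembly uses byte offsets (256 * 2 == #512). */
    enum {
        MB_DIFF_Y = 0,       /* 16x16 luma:   diff[0]   .. diff[255] */
        MB_DIFF_U = 256,     /*  8x8 U plane: diff[256] .. diff[319] */
        MB_DIFF_V = 320,     /*  8x8 V plane: diff[320] .. diff[383] */
        MB_DIFF_SIZE = 384
    };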
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
deleted file mode 100644
index d219e2d..0000000
--- a/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_memcpy_partial_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;this is not a full memcpy function!!!
-;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
-;                             int sz);
-|vp8_memcpy_partial_neon| PROC
-    vpush               {d8-d15}
-    ;pld                [r1]                        ;preload pred data
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    mov             r12, r2, lsr #8                 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
-    vld1.8          {q0, q1}, [r1]!                 ;load src data
-    subs            r12, r12, #1
-    vld1.8          {q2, q3}, [r1]!
-    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
-    vld1.8          {q4, q5}, [r1]!
-    vst1.8          {q2, q3}, [r0]!
-    vld1.8          {q6, q7}, [r1]!
-    vst1.8          {q4, q5}, [r0]!
-    vld1.8          {q8, q9}, [r1]!
-    vst1.8          {q6, q7}, [r0]!
-    vld1.8          {q10, q11}, [r1]!
-    vst1.8          {q8, q9}, [r0]!
-    vld1.8          {q12, q13}, [r1]!
-    vst1.8          {q10, q11}, [r0]!
-    vld1.8          {q14, q15}, [r1]!
-    vst1.8          {q12, q13}, [r0]!
-    vst1.8          {q14, q15}, [r0]!
-
-    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    bne             memcpy_neon_loop
-
-    ands            r3, r2, #0xff                   ;extra copy
-    beq             done_copy_neon_loop
-
-extra_copy_neon_loop
-    vld1.8          {q0}, [r1]!                 ;load src data
-    subs            r3, r3, #16
-    vst1.8          {q0}, [r0]!
-    bne             extra_copy_neon_loop
-
-done_copy_neon_loop
-    vpop            {d8-d15}
-    bx              lr
-    ENDP
-
-    END
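As its own comment warns, the routine above is not a general memcpy: the main loop always executes at least once and the tail loop counts down in steps of 16, so it is only well-defined when sz is at least 256 bytes and a multiple of 16. A scalar sketch of the same chunked pattern (function name illustrative):

    #include <string.h>

    /* Sketch of the copy pattern above: 256-byte blocks (q0-q15 in the NEON
     * code), then 16-byte tail chunks.  Unlike the assembly this also
     * tolerates sz < 256, but like it, the tail still copies whole 16-byte
     * chunks. */
    static void memcpy_partial_sketch(unsigned char *dst,
                                      const unsigned char *src, int sz)
    {
        int i = 0;
        for (; i + 256 <= sz; i += 256)
            memcpy(dst + i, src + i, 256);
        for (; i < sz; i += 16)
            memcpy(dst + i, src + i, 16);
    }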
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
deleted file mode 100644
index f82af3e..0000000
--- a/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,123 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_neon|
-    EXPORT  |vp8_get4x4sse_cs_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;note: in this function, sum is never used. So, we can remove this part of calculation
-;from vp8_variance().
-
-|vp8_mse16x16_neon| PROC
-    vpush           {q7}
-
-    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
-    vmov.i8         q8, #0
-    vmov.i8         q9, #0
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-mse16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmlal.s16       q7, d22, d22
-    vmlal.s16       q8, d23, d23
-
-    subs            r12, r12, #1
-
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vmlal.s16       q7, d26, d26
-    vmlal.s16       q8, d27, d27
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             mse16x16_neon_loop
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-
-    ldr             r12, [sp, #16]              ;load *sse from stack
-
-    vadd.u32        q10, q7, q9
-    vpaddl.u32      q1, q10
-    vadd.u64        d0, d2, d3
-
-    vst1.32         {d0[0]}, [r12]
-    vmov.32         r0, d0[0]
-
-    vpop            {q7}
-    bx              lr
-
-    ENDP
-
-
-;=============================
-; r0    unsigned char *src_ptr,
-; r1    int  source_stride,
-; r2    unsigned char *ref_ptr,
-; r3    int  recon_stride
-|vp8_get4x4sse_cs_neon| PROC
-    vpush           {q7}
-
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmull.s16       q7, d22, d22
-    vmull.s16       q8, d24, d24
-    vmull.s16       q9, d26, d26
-    vmull.s16       q10, d28, d28
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-    vadd.u32        q9, q7, q9
-
-    vpaddl.u32      q1, q9
-    vadd.u64        d0, d2, d3
-
-    vmov.32         r0, d0[0]
-
-    vpop            {q7}
-    bx              lr
-
-    ENDP
-
-    END
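vp8_get4x4sse_cs_neon above is the same sum-of-squared-differences reduction as the 16x16 MSE kernel, restricted to a single 4x4 block (only the low four lanes of each loaded row are squared). A scalar sketch (the name and parameter spellings are illustrative):

    /* Scalar sketch of the 4x4 sum of squared differences computed above. */
    static unsigned int get4x4sse_cs_sketch(const unsigned char *src,
                                            int src_stride,
                                            const unsigned char *ref,
                                            int ref_stride)
    {
        unsigned int sse = 0;
        int r, c;
        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++) {
                int d = src[c] - ref[c];
                sse += d * d;
            }
            src += src_stride;
            ref += ref_stride;
        }
        return sse;
    }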
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
deleted file mode 100644
index 2226629..0000000
--- a/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? would add 1 << 3-1 = 2
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 0000000..5ad9465
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+#include "./vp8_rtcd.h"
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+  vp8_short_walsh4x4_c(input, output, pitch);
+}
+#else
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32  = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32  = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+
+    q8u32  = vcltq_s32(q0s32, qEmptys32);
+    q9u32  = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32  = vreinterpretq_s32_u32(q8u32);
+    q9s32  = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+
+    q8s32  = vaddq_s32(q0s32, q15s32);
+    q9s32  = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
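
For reference, the first-pass butterfly that the intrinsics above vectorize, written out for a single transposed column in scalar form (a sketch derived from the code above, not part of the patch; the helper name is hypothetical):

    /* One column of the first Walsh-Hadamard pass, matching the d4..d7 and
     * d0..d3 lanes above; ip holds one 4-sample column after the transpose. */
    static void walsh4x4_col_pass1(const short ip[4], short op[4]) {
        int a1 = (ip[0] + ip[2]) << 2;         /* vshl_n_s16(d4s16, 2) */
        int d1 = (ip[1] + ip[3]) << 2;
        int c1 = (ip[1] - ip[3]) << 2;
        int b1 = (ip[0] - ip[2]) << 2;
        op[0] = (short)(a1 + d1 + (a1 != 0));  /* subtracting the vceq/vmvn mask adds (a1 != 0) */
        op[1] = (short)(b1 + c1);
        op[2] = (short)(b1 - c1);
        op[3] = (short)(a1 - d1);
    }
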
diff --git a/libvpx/vp8/encoder/arm/quantize_arm.c b/libvpx/vp8/encoder/arm/quantize_arm.c
deleted file mode 100644
index 80d9ad0..0000000
--- a/libvpx/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/encoder/block.h"
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_NEON
-
-/* vp8_quantize_mbX functions here differs from corresponding ones in
- * quantize.c only by using quantize_b_pair function pointer instead of
- * the regular quantize_b function pointer */
-void vp8_quantize_mby_neon(MACROBLOCK *x)
-{
-    int i;
-    int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-    for (i = 0; i < 16; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
-    if(has_2nd_order)
-        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x)
-{
-    int i;
-    int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-    for (i = 0; i < 24; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
-    if (has_2nd_order)
-        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x)
-{
-    int i;
-
-    for (i = 16; i < 24; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-}
-
-#endif /* HAVE_NEON */
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index 9d0e69c..ea279b3 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -159,7 +159,7 @@
     );
 }
 
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 {
     const TOKENEXTRA *stop = p + xcount;
     unsigned int split;
@@ -374,7 +374,7 @@
 
 }
 
-static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data,
                                           unsigned char * cx_data_end,
                                           int num_part)
 {
@@ -398,7 +398,7 @@
             const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
             int tokens = (int)(stop - p);
 
-            vp8_pack_tokens_c(w, p, tokens);
+            vp8_pack_tokens(w, p, tokens);
         }
 
         vp8_stop_encode(w);
@@ -407,7 +407,7 @@
 }
 
 
-static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
 {
     int mb_row;
 
@@ -417,7 +417,7 @@
         const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
         int tokens = (int)(stop - p);
 
-        vp8_pack_tokens_c(w, p, tokens);
+        vp8_pack_tokens(w, p, tokens);
     }
 
 }
@@ -1543,7 +1543,7 @@
     if (pc->refresh_entropy_probs == 0)
     {
         /* save a copy for later refresh */
-        vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+        memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
     }
 
     vp8_update_coef_probs(cpi);
@@ -1620,7 +1620,7 @@
             /* concatenate partition buffers */
             for(i = 0; i < num_part; i++)
             {
-                vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+                memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
                 cpi->partition_d[i+1] = dp;
                 dp += cpi->partition_sz[i+1];
             }
@@ -1676,7 +1676,7 @@
             pack_mb_row_tokens(cpi, &cpi->bc[1]);
         else
 #endif
-            pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+            vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
 
         vp8_stop_encode(&cpi->bc[1]);
 
diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h
index 66f4bf6..de69805 100644
--- a/libvpx/vp8/encoder/bitstream.h
+++ b/libvpx/vp8/encoder/bitstream.h
@@ -16,36 +16,7 @@
 extern "C" {
 #endif
 
-#if HAVE_EDSP
-void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
-                             vp8_token *,
-                             const vp8_extra_bit_struct *,
-                             const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
-                                             unsigned char * cx_data,
-                                             const unsigned char *cx_data_end,
-                                             int num_parts,
-                                             vp8_token *,
-                                             const vp8_extra_bit_struct *,
-                                             const vp8_tree_index *);
-void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
-                                    vp8_token *,
-                                    const vp8_extra_bit_struct *,
-                                    const vp8_tree_index *);
-# define pack_tokens(a,b,c)                  \
-    vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_tokens_into_partitions(a,b,c,d)  \
-    vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_mb_row_tokens(a,b)               \
-    vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-#else
-
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
-
-# define pack_tokens(a,b,c)                    vp8_pack_tokens_c(a,b,c)
-# define pack_tokens_into_partitions(a,b,c,d)  pack_tokens_into_partitions_c(a,b,c,d)
-# define pack_mb_row_tokens(a,b)               pack_mb_row_tokens_c(a,b)
-#endif
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp8/encoder/block.h b/libvpx/vp8/encoder/block.h
index 1f212ca..248e795 100644
--- a/libvpx/vp8/encoder/block.h
+++ b/libvpx/vp8/encoder/block.h
@@ -125,6 +125,8 @@
 
     int optimize;
     int q_index;
+    int is_skin;
+    int denoise_zeromv;
 
 #if CONFIG_TEMPORAL_DENOISING
     int increase_denoising;
@@ -160,8 +162,9 @@
     void (*short_fdct8x4)(short *input, short *output, int pitch);
     void (*short_walsh4x4)(short *input, short *output, int pitch);
     void (*quantize_b)(BLOCK *b, BLOCKD *d);
-    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
 
+    unsigned int mbs_zero_last_dot_suppress;
+    int zero_last_dot_suppress;
 } MACROBLOCK;
 
 
diff --git a/libvpx/vp8/encoder/dct.c b/libvpx/vp8/encoder/dct.c
index 091554a..0c7198d 100644
--- a/libvpx/vp8/encoder/dct.c
+++ b/libvpx/vp8/encoder/dct.c
@@ -11,6 +11,8 @@
 
 #include <math.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
 {
     int i;
diff --git a/libvpx/vp8/encoder/denoising.c b/libvpx/vp8/encoder/denoising.c
index 75401fc..d197f8f 100644
--- a/libvpx/vp8/encoder/denoising.c
+++ b/libvpx/vp8/encoder/denoising.c
@@ -68,6 +68,10 @@
     int adj_val[3] = {3, 4, 6};
     int shift_inc1 = 0;
     int shift_inc2 = 1;
+    int col_sum[16] = {0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
     /* If motion_magnitude is small, making the denoiser more aggressive by
      * increasing the adjustment for each level. Add another increment for
      * blocks that are labeled for increase denoising. */
@@ -98,11 +102,11 @@
             if (absdiff <= 3 + shift_inc1)
             {
                 running_avg_y[c] = mc_running_avg_y[c];
-                sum_diff += diff;
+                col_sum[c] += diff;
             }
             else
             {
-                if (absdiff >= 4 && absdiff <= 7)
+                if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
                     adjustment = adj_val[0];
                 else if (absdiff >= 8 && absdiff <= 15)
                     adjustment = adj_val[1];
@@ -116,7 +120,7 @@
                     else
                         running_avg_y[c] = sig[c] + adjustment;
 
-                    sum_diff += adjustment;
+                    col_sum[c] += adjustment;
                 }
                 else
                 {
@@ -125,7 +129,7 @@
                     else
                         running_avg_y[c] = sig[c] - adjustment;
 
-                    sum_diff -= adjustment;
+                    col_sum[c] -= adjustment;
                 }
             }
         }
@@ -136,6 +140,23 @@
         running_avg_y += avg_y_stride;
     }
 
+    for (c = 0; c < 16; ++c) {
+      // Below we clip the value in the same way the SSE code does.
+      // With the aggressive denoiser, the adj_val for each pixel can be
+      // at most 8 (the current maximum adjustment of the map).
+      // The SSE code accumulates the sum of adj_val over each column,
+      // so the sum can reach up to 128 (16 rows). However, the
+      // representable range in the SSE code is -128 ~ 127, which is
+      // why we apply the same clamp here in the C code.
+      // We don't do this for the UV denoiser: there are only 8 rows
+      // and the maximum adjustment is <= 8, so a column sum cannot
+      // exceed 64.
+      if (col_sum[c] >= 128) {
+        col_sum[c] = 127;
+      }
+      sum_diff += col_sum[c];
+    }
+
     sum_diff_thresh= SUM_DIFF_THRESHOLD;
     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
     if (abs(sum_diff) > sum_diff_thresh) {
@@ -166,14 +187,14 @@
                 running_avg_y[c] = 0;
               else
                 running_avg_y[c] = running_avg_y[c] - adjustment;
-              sum_diff -= adjustment;
+              col_sum[c] -= adjustment;
             } else if (diff < 0) {
               // Bring denoised signal up.
               if (running_avg_y[c] + adjustment > 255)
                 running_avg_y[c] = 255;
               else
                 running_avg_y[c] = running_avg_y[c] + adjustment;
-              sum_diff += adjustment;
+              col_sum[c] += adjustment;
             }
           }
           // TODO(marpan): Check here if abs(sum_diff) has gone below the
@@ -182,6 +203,15 @@
           mc_running_avg_y += mc_avg_y_stride;
           running_avg_y += avg_y_stride;
         }
+
+        sum_diff = 0;
+        for (c = 0; c < 16; ++c) {
+          if (col_sum[c] >= 128) {
+            col_sum[c] = 127;
+          }
+          sum_diff += col_sum[c];
+        }
+
         if (abs(sum_diff) > sum_diff_thresh)
           return COPY_BLOCK;
       } else {
@@ -341,8 +371,10 @@
     denoiser->denoiser_mode = kDenoiserOnYOnly;
   } else if (mode == 2) {
     denoiser->denoiser_mode = kDenoiserOnYUV;
-  } else {
+  } else if (mode == 3) {
     denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
+  } else {
+    denoiser->denoiser_mode = kDenoiserOnYUV;
   }
   if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
     denoiser->denoise_pars.scale_sse_thresh = 1;
@@ -352,14 +384,16 @@
     denoiser->denoise_pars.pickmode_mv_bias = 100;
     denoiser->denoise_pars.qp_thresh = 0;
     denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+    denoiser->denoise_pars.spatial_blur = 0;
   } else {
     denoiser->denoise_pars.scale_sse_thresh = 2;
     denoiser->denoise_pars.scale_motion_thresh = 16;
     denoiser->denoise_pars.scale_increase_filter = 1;
     denoiser->denoise_pars.denoise_mv_bias = 60;
-    denoiser->denoise_pars.pickmode_mv_bias = 60;
-    denoiser->denoise_pars.qp_thresh = 100;
-    denoiser->denoise_pars.consec_zerolast = 10;
+    denoiser->denoise_pars.pickmode_mv_bias = 75;
+    denoiser->denoise_pars.qp_thresh = 80;
+    denoiser->denoise_pars.consec_zerolast = 15;
+    denoiser->denoise_pars.spatial_blur = 0;
   }
 }
 
@@ -381,8 +415,8 @@
             vp8_denoiser_free(denoiser);
             return 1;
         }
-        vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
-                   denoiser->yv12_running_avg[i].frame_size);
+        memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+               denoiser->yv12_running_avg[i].frame_size);
 
     }
     denoiser->yv12_mc_running_avg.flags = 0;
@@ -394,12 +428,43 @@
         return 1;
     }
 
-    vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
-               denoiser->yv12_mc_running_avg.frame_size);
+    memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+           denoiser->yv12_mc_running_avg.frame_size);
+
+    if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
+                                    height, VP8BORDERINPIXELS) < 0) {
+      vp8_denoiser_free(denoiser);
+      return 1;
+    }
+    memset(denoiser->yv12_last_source.buffer_alloc, 0,
+           denoiser->yv12_last_source.frame_size);
 
     denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
-    vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+    memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
     vp8_denoiser_set_parameters(denoiser, mode);
+    denoiser->nmse_source_diff = 0;
+    denoiser->nmse_source_diff_count = 0;
+    denoiser->qp_avg = 0;
+    // QP threshold below which we can go up to aggressive mode.
+    denoiser->qp_threshold_up = 80;
+    // QP threshold above which we can go back down to normal mode.
+    // For now keep this second threshold high, so it is effectively unused.
+    denoiser->qp_threshold_down = 128;
+    // Bitrate thresholds and noise metric (nmse) thresholds for switching to
+    // aggressive mode.
+    // TODO(marpan): Adjust thresholds, including effect on resolution.
+    denoiser->bitrate_threshold = 400000;  // (bits/sec).
+    denoiser->threshold_aggressive_mode = 80;
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 3000000;
+      denoiser->threshold_aggressive_mode = 200;
+    } else if (width * height > 960 * 540) {
+      denoiser->bitrate_threshold = 1200000;
+      denoiser->threshold_aggressive_mode = 120;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 600000;
+      denoiser->threshold_aggressive_mode = 100;
+    }
     return 0;
 }
 
@@ -414,10 +479,10 @@
         vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
     }
     vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
     vpx_free(denoiser->denoise_state);
 }
 
-
 void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              MACROBLOCK *x,
                              unsigned int best_sse,
@@ -488,6 +553,7 @@
              * Note that any changes to the mode info only affects the
              * denoising.
              */
+            x->denoise_zeromv = 1;
             mbmi->ref_frame =
                     x->best_zeromv_reference_frame;
 
@@ -537,6 +603,12 @@
     motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
         NOISE_MOTION_THRESHOLD;
 
+    // If the block is considered to be a skin area, lower the motion threshold.
+    // In the current version the threshold is set to 1, so only very low
+    // (i.e., zero) mv blocks are denoised on skin.
+    if (x->is_skin)
+        motion_threshold = 1;
+
     if (motion_magnitude2 <
         denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
       x->increase_denoising = 1;
@@ -596,6 +668,7 @@
         /* No filtering of this block; it differs too much from the predictor,
          * or the motion vector magnitude is considered too big.
          */
+        x->denoise_zeromv = 0;
         vp8_copy_mem16x16(
                 x->thismb, 16,
                 denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
@@ -626,7 +699,7 @@
       int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
 
       // Fix filter level to some nominal value for now.
-      int filter_level = 32;
+      int filter_level = 48;
 
       int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
       lfi.mblim = lfi_n->mblim[filter_level];
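
The column-sum change in the luma denoiser filter above keeps the C path numerically in step with the SSE2 path: 16 rows times a maximum per-pixel adjustment of 8 gives a column sum of up to 128, one more than a signed 8-bit lane can hold, hence the clamp to 127 before folding into sum_diff. A small sketch of that fold (illustrative only, not from the patch; the helper name is made up):

    /* Clamp each of the 16 column sums as the comment above describes,
     * then accumulate them into the block-level sum_diff. */
    static int fold_col_sums(const int col_sum[16]) {
        int c, sum_diff = 0;
        for (c = 0; c < 16; ++c)
            sum_diff += (col_sum[c] >= 128) ? 127 : col_sum[c];
        return sum_diff;
    }
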
diff --git a/libvpx/vp8/encoder/denoising.h b/libvpx/vp8/encoder/denoising.h
index 89832d3..9a379a6 100644
--- a/libvpx/vp8/encoder/denoising.h
+++ b/libvpx/vp8/encoder/denoising.h
@@ -19,14 +19,16 @@
 #endif
 
 #define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
+#define SUM_DIFF_THRESHOLD_HIGH (600)  // ~(16 * 16 * 2.3)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
 #define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
-#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
 #define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
 
+#define MAX_GF_ARF_DENOISE_RANGE (8)
+
 enum vp8_denoiser_decision
 {
   COPY_BLOCK,
@@ -43,7 +45,8 @@
   kDenoiserOff,
   kDenoiserOnYOnly,
   kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive
+  kDenoiserOnYUVAggressive,
+  kDenoiserOnAdaptive
 };
 
 typedef struct {
@@ -66,15 +69,26 @@
   unsigned int qp_thresh;
   // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
   unsigned int consec_zerolast;
+  // Threshold for amount of spatial blur on Y channel. 0 means no spatial blur.
+  unsigned int spatial_blur;
 } denoise_params;
 
 typedef struct vp8_denoiser
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
+    // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peek.
+    YV12_BUFFER_CONFIG yv12_last_source;
     unsigned char* denoise_state;
     int num_mb_cols;
     int denoiser_mode;
+    int threshold_aggressive_mode;
+    int nmse_source_diff;
+    int nmse_source_diff_count;
+    int qp_avg;
+    int qp_threshold_up;
+    int qp_threshold_down;
+    int bitrate_threshold;
     denoise_params denoise_pars;
 } VP8_DENOISER;
 
@@ -83,6 +97,8 @@
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
+
 void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              MACROBLOCK *x,
                              unsigned int best_sse,
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
index aec6b98..d381d8d 100644
--- a/libvpx/vp8/encoder/encodeframe.c
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "encodemb.h"
 #include "encodemv.h"
 #include "vp8/common/common.h"
@@ -82,6 +83,7 @@
 {
     unsigned int act;
     unsigned int sse;
+    (void)cpi;
     /* TODO: This could also be done over smaller areas (8x8), but that would
      *  require extensive changes elsewhere, as lambda is assumed to be fixed
      *  over an entire MB in most of the code.
@@ -89,7 +91,7 @@
      *  lambda using a non-linear combination (e.g., the smallest, or second
      *  smallest, etc.).
      */
-    act =  vp8_variance16x16(x->src.y_buffer,
+    act =  vpx_variance16x16(x->src.y_buffer,
                     x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
     act = act<<4;
 
@@ -154,8 +156,8 @@
                         cpi->common.MBs));
 
         /* Copy map to sort list */
-        vpx_memcpy( sortlist, cpi->mb_activity_map,
-                    sizeof(unsigned int) * cpi->common.MBs );
+        memcpy( sortlist, cpi->mb_activity_map,
+                sizeof(unsigned int) * cpi->common.MBs );
 
 
         /* Ripple each value down to its correct position */
@@ -522,7 +524,8 @@
             }
 
 #endif
-            // Keep track of how many (consecutive) times a block is coded
+
+            // Keep track of how many (consecutive) times a block is coded
             // as ZEROMV_LASTREF, for base layer frames.
             // Reset to 0 if its coded as anything else.
             if (cpi->current_layer == 0) {
@@ -531,9 +534,14 @@
                 // Increment, check for wrap-around.
                 if (cpi->consec_zero_last[map_index+mb_col] < 255)
                   cpi->consec_zero_last[map_index+mb_col] += 1;
+                if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+                  cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
               } else {
                 cpi->consec_zero_last[map_index+mb_col] = 0;
+                cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
               }
+              if (x->zero_last_dot_suppress)
+                cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
             }
 
             /* Special case code for cyclic refresh
@@ -574,7 +582,7 @@
         /* pack tokens for this MB */
         {
             int tok_count = *tp - tp_start;
-            pack_tokens(w, tp_start, tok_count);
+            vp8_pack_tokens(w, tp_start, tok_count);
         }
 #endif
         /* Increment pointer into gf usage flags structure. */
@@ -658,8 +666,7 @@
 
     x->mvc = cm->fc.mvc;
 
-    vpx_memset(cm->above_context, 0,
-               sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+    memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
 
     /* Special case treatment when GF and ARF are not sensible options
      * for reference
@@ -737,7 +744,7 @@
     const int num_part = (1 << cm->multi_token_partition);
 #endif
 
-    vpx_memset(segment_counts, 0, sizeof(segment_counts));
+    memset(segment_counts, 0, sizeof(segment_counts));
     totalrate = 0;
 
     if (cpi->compressor_speed == 2)
@@ -967,7 +974,7 @@
         int i;
 
         /* Set to defaults */
-        vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+        memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
 
         tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
 
@@ -1143,6 +1150,8 @@
         while (++b < 16);
     }
 
+#else
+    (void)cpi;
 #endif
 
     ++x->ymode_count[m];
@@ -1252,7 +1261,6 @@
         if(cpi->sf.use_fastquant_for_pick)
         {
             x->quantize_b      = vp8_fast_quantize_b;
-            x->quantize_b_pair = vp8_fast_quantize_b_pair;
 
             /* the fast quantizer does not use zbin_extra, so
              * do not recalculate */
@@ -1265,7 +1273,6 @@
         if (cpi->sf.improved_quant)
         {
             x->quantize_b      = vp8_regular_quantize_b;
-            x->quantize_b_pair = vp8_regular_quantize_b_pair;
         }
 
         /* restore cpi->zbin_mode_boost_enabled */
diff --git a/libvpx/vp8/encoder/encodeintra.c b/libvpx/vp8/encoder/encodeintra.c
index cfa4cb9..938cc7e 100644
--- a/libvpx/vp8/encoder/encodeintra.c
+++ b/libvpx/vp8/encoder/encodeintra.c
@@ -11,7 +11,8 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#include "quantize.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/encoder/quantize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
 #include "vp8/common/invtrans.h"
@@ -44,7 +45,7 @@
         }
     }
 
-    intra_pred_var = vp8_get_mb_ss(x->src_diff);
+    intra_pred_var = vpx_get_mb_ss(x->src_diff);
 
     return intra_pred_var;
 }
diff --git a/libvpx/vp8/encoder/encodemb.c b/libvpx/vp8/encoder/encodemb.c
index 7ed2fe1..932a157 100644
--- a/libvpx/vp8/encoder/encodemb.c
+++ b/libvpx/vp8/encoder/encodemb.c
@@ -8,89 +8,41 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "encodemb.h"
 #include "vp8/common/reconinter.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "tokenize.h"
 #include "vp8/common/invtrans.h"
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"
 
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
-    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
-    int src_stride = be->src_stride;
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
 
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-        }
-
-        diff_ptr += pitch;
-        pred_ptr += pitch;
-        src_ptr  += src_stride;
-    }
+  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
+                     pred_ptr, pitch);
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
                          int src_stride, unsigned char *upred,
-                         unsigned char *vpred, int pred_stride)
-{
-    short *udiff = diff + 256;
-    short *vdiff = diff + 320;
+                         unsigned char *vpred, int pred_stride) {
+  short *udiff = diff + 256;
+  short *vdiff = diff + 320;
 
-    int r, c;
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            udiff[c] = usrc[c] - upred[c];
-        }
-
-        udiff += 8;
-        upred += pred_stride;
-        usrc  += src_stride;
-    }
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            vdiff[c] = vsrc[c] - vpred[c];
-        }
-
-        vdiff += 8;
-        vpred += pred_stride;
-        vsrc  += src_stride;
-    }
+  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
+  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
-                        unsigned char *pred, int pred_stride)
-{
-    int r, c;
-
-    for (r = 0; r < 16; r++)
-    {
-        for (c = 0; c < 16; c++)
-        {
-            diff[c] = src[c] - pred[c];
-        }
-
-        diff += 16;
-        pred += pred_stride;
-        src  += src_stride;
-    }
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride) {
+  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
 }
 
 static void vp8_subtract_mb(MACROBLOCK *x)
@@ -258,13 +210,6 @@
     b = &mb->block[ib];
     d = &mb->e_mbd.block[ib];
 
-    /* Enable this to test the effect of RDO as a replacement for the dynamic
-     *  zero bin instead of an augmentation of it.
-     */
-#if 0
-    vp8_strict_quantize_b(b, d);
-#endif
-
     dequant_ptr = d->dequant;
     coeff_ptr = b->coeff;
     qcoeff_ptr = d->qcoeff;
@@ -513,8 +458,8 @@
     ENTROPY_CONTEXT *ta;
     ENTROPY_CONTEXT *tl;
 
-    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -562,8 +507,8 @@
     if (!x->e_mbd.left_context)
         return;
 
-    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -602,8 +547,8 @@
     if (!x->e_mbd.left_context)
         return;
 
-    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
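
The subtract rewrites above delegate the per-pixel differencing to the shared vpx_dsp helper. The scalar behaviour they rely on is roughly the following, using the parameter order seen in the calls above (a sketch, not the actual vpx_dsp implementation; the name is made up):

    /* rows x cols residual: diff[r][c] = src[r][c] - pred[r][c], with each
     * plane walked by its own stride, mirroring the loops deleted above. */
    static void subtract_block_sketch(int rows, int cols,
                                      short *diff, int diff_stride,
                                      const unsigned char *src, int src_stride,
                                      const unsigned char *pred, int pred_stride) {
        int r, c;
        for (r = 0; r < rows; ++r) {
            for (c = 0; c < cols; ++c)
                diff[c] = (short)(src[c] - pred[c]);
            diff += diff_stride;
            src  += src_stride;
            pred += pred_stride;
        }
    }
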
diff --git a/libvpx/vp8/encoder/encodemb.h b/libvpx/vp8/encoder/encodemb.h
index 0b3ec87..10b3d86 100644
--- a/libvpx/vp8/encoder/encodemb.h
+++ b/libvpx/vp8/encoder/encodemb.h
@@ -19,6 +19,13 @@
 #endif
 void vp8_encode_inter16x16(MACROBLOCK *x);
 
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                       int src_stride, unsigned char *upred,
+                       unsigned char *vpred, int pred_stride);
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride);
+
 void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c
index 7b8b51f..4e234cc 100644
--- a/libvpx/vp8/encoder/ethreading.c
+++ b/libvpx/vp8/encoder/ethreading.c
@@ -19,8 +19,6 @@
 
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
 
-extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
-
 static THREAD_FUNCTION thread_loopfilter(void *p_data)
 {
     VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
@@ -215,11 +213,15 @@
                                   LAST_FRAME) {
                             // Increment, check for wrap-around.
                             if (cpi->consec_zero_last[map_index+mb_col] < 255)
-                              cpi->consec_zero_last[map_index+mb_col] +=
-                                  1;
+                              cpi->consec_zero_last[map_index+mb_col] += 1;
+                            if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+                              cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
                           } else {
                             cpi->consec_zero_last[map_index+mb_col] = 0;
+                            cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
                           }
+                          if (x->zero_last_dot_suppress)
+                            cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
                         }
 
                         /* Special case code for cyclic refresh
@@ -261,7 +263,7 @@
                     /* pack tokens for this MB */
                     {
                         int tok_count = tp - tp_start;
-                        pack_tokens(w, tp_start, tok_count);
+                        vp8_pack_tokens(w, tp_start, tok_count);
                     }
 #else
                     cpi->tplist[mb_row].stop = tp;
@@ -346,7 +348,6 @@
     z->short_fdct8x4     = x->short_fdct8x4;
     z->short_walsh4x4    = x->short_walsh4x4;
     z->quantize_b        = x->quantize_b;
-    z->quantize_b_pair   = x->quantize_b_pair;
     z->optimize          = x->optimize;
 
     /*
@@ -413,14 +414,13 @@
         zd->subpixel_predict16x16    = xd->subpixel_predict16x16;
         zd->segmentation_enabled     = xd->segmentation_enabled;
         zd->mb_segement_abs_delta      = xd->mb_segement_abs_delta;
-        vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data,
-                   sizeof(xd->segment_feature_data));
+        memcpy(zd->segment_feature_data, xd->segment_feature_data,
+               sizeof(xd->segment_feature_data));
 
-        vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc,
-                   sizeof(xd->dequant_y1_dc));
-        vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
-        vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
-        vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+        memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
 #if 1
         /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
@@ -435,15 +435,14 @@
 #endif
 
 
-        vpx_memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
-        vpx_memcpy(z->rd_thresh_mult, x->rd_thresh_mult,
-                   sizeof(x->rd_thresh_mult));
+        memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+        memcpy(z->rd_thresh_mult, x->rd_thresh_mult, sizeof(x->rd_thresh_mult));
 
         z->zbin_over_quant = x->zbin_over_quant;
         z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
         z->zbin_mode_boost = x->zbin_mode_boost;
 
-        vpx_memset(z->error_bins, 0, sizeof(z->error_bins));
+        memset(z->error_bins, 0, sizeof(z->error_bins));
     }
 }
 
@@ -469,7 +468,7 @@
         mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
         mb->gf_active_ptr            = x->gf_active_ptr;
 
-        vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+        memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
         mbr_ei[i].totalrate = 0;
 
         mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
@@ -506,6 +505,7 @@
         mb->intra_error = 0;
         vp8_zero(mb->count_mb_ref_frame_usage);
         mb->mbs_tested_so_far = 0;
+        mb->mbs_zero_last_dot_suppress = 0;
     }
 }
 
@@ -543,7 +543,7 @@
                         vpx_malloc(sizeof(sem_t) * th_count));
         CHECK_MEM_ERROR(cpi->mb_row_ei,
                         vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
-        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+        memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
         CHECK_MEM_ERROR(cpi->en_thread_data,
                         vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
 
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
index 98e5a71..4c2acc7 100644
--- a/libvpx/vp8/encoder/firstpass.c
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -12,10 +12,11 @@
 #include <limits.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "block.h"
 #include "onyx_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "encodeintra.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
@@ -34,8 +35,6 @@
 /* #define OUTPUT_FPF 1 */
 
 extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
-extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
-extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
 
 #define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
 extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
@@ -132,6 +131,7 @@
                          FIRSTPASS_STATS            *stats)
 {
     struct vpx_codec_cx_pkt pkt;
+    (void)cpi;
     pkt.kind = VPX_CODEC_STATS_PKT;
     pkt.data.twopass_stats.buf = stats;
     pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
@@ -418,18 +418,19 @@
     int raw_stride = raw_buffer->y_stride;
     unsigned char *ref_ptr;
     int ref_stride = x->e_mbd.pre.y_stride;
+    (void)cpi;
 
     /* Set up pointers for this macro block raw buffer */
     raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
                                 + d->offset);
-    vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
-                   (unsigned int *)(raw_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
+                 (unsigned int *)(raw_motion_err));
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
     ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
-    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
-                   (unsigned int *)(best_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+                 (unsigned int *)(best_motion_err));
 }
 
 static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -453,7 +454,7 @@
     int new_mv_mode_penalty = 256;
 
     /* override the default variance function to use MSE */
-    v_fn_ptr.vf    = vp8_mse16x16;
+    v_fn_ptr.vf    = vpx_mse16x16;
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
@@ -571,7 +572,7 @@
     {
         int flag[2] = {1, 1};
         vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
-        vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+        memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
     }
 
@@ -1327,8 +1328,6 @@
     return Q;
 }
 
-extern void vp8_new_framerate(VP8_COMP *cpi, double framerate);
-
 void vp8_init_second_pass(VP8_COMP *cpi)
 {
     FIRSTPASS_STATS this_frame;
@@ -1409,6 +1408,7 @@
 
 void vp8_end_second_pass(VP8_COMP *cpi)
 {
+  (void)cpi;
 }
 
 /* This function gives and estimate of how badly we believe the prediction
@@ -1419,6 +1419,7 @@
     double prediction_decay_rate;
     double motion_decay;
     double motion_pct = next_frame->pcnt_motion;
+    (void)cpi;
 
     /* Initial basis is the % mbs inter coded */
     prediction_decay_rate = next_frame->pcnt_inter;
@@ -1547,6 +1548,7 @@
     double this_frame_mvr_ratio;
     double this_frame_mvc_ratio;
     double motion_pct;
+    (void)cpi;
 
     /* Accumulate motion stats. */
     motion_pct = this_frame->pcnt_motion;
@@ -1774,7 +1776,7 @@
 
     start_pos = cpi->twopass.stats_in;
 
-    vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+    memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
 
     /* Load stats for the current frame. */
     mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1870,7 +1872,7 @@
             break;
         }
 
-        vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+        memcpy(this_frame, &next_frame, sizeof(*this_frame));
 
         old_boost_score = boost_score;
     }
@@ -2440,7 +2442,7 @@
     if (cpi->twopass.frames_to_key == 0)
     {
         /* Define next KF group and assign bits to it */
-        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
         find_next_key_frame(cpi, &this_frame_copy);
 
         /* Special case: Error error_resilient_mode mode does not make much
@@ -2466,7 +2468,7 @@
     if (cpi->frames_till_gf_update_due == 0)
     {
         /* Define next gf group and assign bits to it */
-        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
         define_gf_group(cpi, &this_frame_copy);
 
         /* If we are going to code an altref frame at the end of the group
@@ -2482,7 +2484,7 @@
              * to the GF group
              */
             int bak = cpi->per_frame_bandwidth;
-            vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
             assign_std_frame_bits(cpi, &this_frame_copy);
             cpi->per_frame_bandwidth = bak;
         }
@@ -2505,14 +2507,14 @@
             if (cpi->common.frame_type != KEY_FRAME)
             {
                 /* Assign bits from those allocated to the GF group */
-                vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+                memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
                 assign_std_frame_bits(cpi, &this_frame_copy);
             }
         }
         else
         {
             /* Assign bits from those allocated to the GF group */
-            vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
             assign_std_frame_bits(cpi, &this_frame_copy);
         }
     }
@@ -2653,7 +2655,7 @@
         double decay_accumulator = 1.0;
         double next_iiratio;
 
-        vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+        memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
 
         /* Note the starting file position so we can reset to it */
         start_pos = cpi->twopass.stats_in;
@@ -2730,7 +2732,7 @@
     double kf_group_coded_err = 0.0;
     double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
 
-    vpx_memset(&next_frame, 0, sizeof(next_frame));
+    memset(&next_frame, 0, sizeof(next_frame));
 
     vp8_clear_system_state();
     start_position = cpi->twopass.stats_in;
@@ -2751,7 +2753,7 @@
     cpi->twopass.frames_to_key = 1;
 
     /* Take a copy of the initial frame details */
-    vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+    memcpy(&first_frame, this_frame, sizeof(*this_frame));
 
     cpi->twopass.kf_group_bits = 0;
     cpi->twopass.kf_group_error_left = 0;
@@ -2774,7 +2776,7 @@
         kf_group_coded_err += this_frame->coded_error;
 
         /* Load the next frame's stats. */
-        vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+        memcpy(&last_frame, this_frame, sizeof(*this_frame));
         input_stats(cpi, this_frame);
 
         /* Provided that we are not at the end of the file... */
@@ -2842,7 +2844,7 @@
         cpi->twopass.frames_to_key /= 2;
 
         /* Copy first frame details */
-        vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+        memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
 
         /* Reset to the start of the group */
         reset_fpf_position(cpi, start_position);
@@ -2964,7 +2966,6 @@
      */
     decay_accumulator = 1.0;
     boost_score = 0.0;
-    loop_decay_rate = 1.00;       /* Starting decay rate */
 
     for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
     {
@@ -3208,7 +3209,7 @@
         int new_width = cpi->oxcf.Width;
         int new_height = cpi->oxcf.Height;
 
-        int projected_buffer_level = (int)cpi->buffer_level;
+        int projected_buffer_level;
         int tmp_q;
 
         double projected_bits_perframe;
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
index 54abe76..f848e8f 100644
--- a/libvpx/vp8/encoder/mcomp.c
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -9,6 +9,8 @@
  */
 
 
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "onyx_int.h"
 #include "mcomp.h"
 #include "vpx_mem/vpx_mem.h"
@@ -393,8 +395,8 @@
 #endif
 
     /* central mv */
-    bestmv->as_mv.row <<= 3;
-    bestmv->as_mv.col <<= 3;
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
     startmv = *bestmv;
 
     /* calculate central point error */
@@ -888,6 +890,8 @@
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
+    (void)mvcost;
+
     /* adjust ref_mv to make sure it is within MV range */
     vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
     br = ref_mv->as_mv.row;
@@ -898,7 +902,7 @@
     this_offset = base_offset + (br * (pre_stride)) + bc;
     this_mv.as_mv.row = br;
     this_mv.as_mv.col = bc;
-    bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
+    bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride)
             + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
 #if CONFIG_MULTI_RES_ENCODING
@@ -911,6 +915,8 @@
     else if (search_param >= 1) hex_range = 63;
 
     dia_range = 8;
+#else
+    (void)search_param;
 #endif
 
     /* hex search */
@@ -923,7 +929,7 @@
             this_mv.as_mv.row = br + hex[i].row;
             this_mv.as_mv.col = bc + hex[i].col;
             this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
             CHECK_BETTER
         }
     }else
@@ -934,7 +940,7 @@
             this_mv.as_mv.col = bc + hex[i].col;
             CHECK_POINT
             this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
             CHECK_BETTER
         }
     }
@@ -960,7 +966,7 @@
                 this_mv.as_mv.row = br + next_chkpts[k][i].row;
                 this_mv.as_mv.col = bc + next_chkpts[k][i].col;
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
                 CHECK_BETTER
             }
         }else
@@ -971,7 +977,7 @@
                 this_mv.as_mv.col = bc + next_chkpts[k][i].col;
                 CHECK_POINT
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
                 CHECK_BETTER
             }
         }
@@ -1002,7 +1008,7 @@
                 this_mv.as_mv.row = br + neighbors[i].row;
                 this_mv.as_mv.col = bc + neighbors[i].col;
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
                 CHECK_BETTER
             }
         }else
@@ -1013,7 +1019,7 @@
                 this_mv.as_mv.col = bc + neighbors[i].col;
                 CHECK_POINT
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
                 CHECK_BETTER
             }
         }
@@ -1097,7 +1103,7 @@
     best_address = in_what;
 
     /* Check the starting position */
-    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
             + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
     /* search_param determines the length of the initial step and hence
@@ -1122,7 +1128,7 @@
 
             {
                 check_here = ss[i].offset + best_address;
-                thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+                thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
                 if (thissad < bestsad)
                 {
@@ -1221,7 +1227,7 @@
     best_address = in_what;
 
     /* Check the starting position */
-    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
             + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
     /* search_param determines the length of the initial step and hence the
@@ -1289,7 +1295,7 @@
                 (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
                 {
                     check_here = ss[i].offset + best_address;
-                    thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+                    thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
                     if (thissad < bestsad)
                     {
@@ -1372,8 +1378,7 @@
     best_mv->as_mv.col = ref_col;
 
     /* Baseline value at the centre */
-    bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                          in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
             + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
     /* Apply further limits to prevent us looking using vectors that
@@ -1398,7 +1403,7 @@
 
         for (c = col_min; c < col_max; c++)
         {
-            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
             this_mv.as_mv.col = c;
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
@@ -1470,8 +1475,7 @@
     best_mv->as_mv.col = ref_col;
 
     /* Baseline value at the centre */
-    bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                          in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
             + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
     /* Apply further limits to prevent us looking using vectors that stretch
@@ -1527,7 +1531,7 @@
 
         while (c < col_max)
         {
-            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
             if (thissad < bestsad)
             {
@@ -1586,7 +1590,8 @@
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+    // TODO(johannkoenig): check if this alignment is necessary.
+    DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
     unsigned int sad_array[3];
 
     int *mvsadcost[2];
@@ -1605,8 +1610,7 @@
     best_mv->as_mv.col = ref_col;
 
     /* Baseline value at the centre */
-    bestsad = fn_ptr->sdf(what, what_stride,
-                          bestaddress, in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
             + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
     /* Apply further limits to prevent us looking using vectors that stretch
@@ -1692,7 +1696,7 @@
 
         while (c < col_max)
         {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
 
             if (thissad < bestsad)
             {
@@ -1750,8 +1754,7 @@
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                          in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
             + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
 
     for (i=0; i<search_range; i++)
@@ -1767,7 +1770,7 @@
             (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
             {
                 check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
-                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
 
                 if (thissad < bestsad)
                 {
@@ -1830,8 +1833,7 @@
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                          in_what_stride, UINT_MAX)
+    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
             + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
 
     for (i=0; i<search_range; i++)
@@ -1882,7 +1884,7 @@
                 (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
                 {
                     check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
-                    thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+                    thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
 
                     if (thissad < bestsad)
                     {
@@ -1974,8 +1976,8 @@
 #ifdef VP8_ENTROPY_STATS
 void init_mv_ref_counts()
 {
-    vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-    vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+    memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+    memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
 }
 
 void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
diff --git a/libvpx/vp8/encoder/mcomp.h b/libvpx/vp8/encoder/mcomp.h
index f284f7c..1694af8 100644
--- a/libvpx/vp8/encoder/mcomp.h
+++ b/libvpx/vp8/encoder/mcomp.h
@@ -13,7 +13,7 @@
 #define VP8_ENCODER_MCOMP_H_
 
 #include "block.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libvpx/vp8/encoder/mips/msa/dct_msa.c b/libvpx/vp8/encoder/mips/msa/dct_msa.c
new file mode 100644
index 0000000..be61ffa
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/msa/dct_msa.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                   \
+    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                   \
+                                                                    \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
+    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                          \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
+    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                          \
+    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);            \
+    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);            \
+}
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)    \
+{                                                                   \
+    v8i16 tmp0_m;                                                   \
+                                                                    \
+    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2);  \
+    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);    \
+}
+
+#define RET_1_IF_NZERO_H(in0)       \
+({                                  \
+    v8i16 tmp0_m;                   \
+    v8i16 one_m = __msa_ldi_h(1);   \
+                                    \
+    tmp0_m = __msa_ceqi_h(in0, 0);  \
+    tmp0_m = tmp0_m ^ 255;          \
+    tmp0_m = one_m & tmp0_m;        \
+                                    \
+    tmp0_m;                         \
+})
+
+#define RET_1_IF_NZERO_W(in0)       \
+({                                  \
+    v4i32 tmp0_m;                   \
+    v4i32 one_m = __msa_ldi_w(1);   \
+                                    \
+    tmp0_m = __msa_ceqi_w(in0, 0);  \
+    tmp0_m = tmp0_m ^ 255;          \
+    tmp0_m = one_m & tmp0_m;        \
+                                    \
+    tmp0_m;                         \
+})
+
+#define RET_1_IF_NEG_W(in0)           \
+({                                    \
+    v4i32 tmp0_m;                     \
+                                      \
+    v4i32 one_m = __msa_ldi_w(1);     \
+    tmp0_m = __msa_clti_s_w(in0, 0);  \
+    tmp0_m = one_m & tmp0_m;          \
+                                      \
+    tmp0_m;                           \
+})
+
+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1;
+    v8i16 const0, const1;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v4i32 out0, out1, out2, out3;
+    v8i16 zero = { 0 };
+
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    SLLI_4V(temp0, temp1, in1, in3, 3);
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+    temp0 = __msa_ilvr_h(in3, in1);
+    in1 = __msa_splati_h(coeff, 3);
+    out0 = (v4i32)__msa_ilvev_h(zero, in1);
+    coeff = __msa_ilvl_h(zero, coeff);
+    out1 = __msa_splati_w((v4i32)coeff, 0);
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
+    out0 >>= 12;
+    out1 >>= 12;
+    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    in0 = temp0 + temp1 + 7;
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
+    temp1 = RET_1_IF_NZERO_H(in3);
+    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
+    SPLATI_W2_SW(coeff, 2, out3, out1);
+    out3 += out1;
+    out1 = __msa_splati_w((v4i32)coeff, 1);
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
+    out1 >>= 16;
+    out3 >>= 16;
+    out1 += (v4i32)temp1;
+    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
+    ST_SH2(in0, in2, output, 8);
+}
+
+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1, tmp0, tmp1;
+    v8i16 const0, const1, const2;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v8i16 zero = { 0 };
+    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
+
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    SLLI_4V(temp0, temp1, in1, in3, 3);
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+    temp0 = __msa_splati_h(coeff, 3);
+    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
+    coeff = __msa_ilvl_h(zero, coeff);
+    vec3_w = __msa_splati_w((v4i32)coeff, 0);
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w;
+    vec2_w = vec3_w;
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    in0 = temp0 + temp1 + 7;
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
+    vec3_w += vec1_w;
+    vec1_w = __msa_splati_w((v4i32)coeff, 1);
+    const0 = RET_1_IF_NZERO_H(in3);
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w;
+    vec2_w = vec3_w;
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    in1 += const0;
+    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
+    ST_SH2(temp0, temp1, output, 8);
+
+    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
+    ST_SH2(in0, in2, output + 16, 8);
+}
+
+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0_h, in1_h, in2_h, in3_h;
+    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
+
+    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
+    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
+
+    UNPCK_R_SH_SW(in0_h, in0_w);
+    UNPCK_R_SH_SW(in1_h, in1_w);
+    UNPCK_R_SH_SW(in2_h, in2_w);
+    UNPCK_R_SH_SW(in3_h, in3_w);
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    SLLI_4V(temp0, temp1, temp2, temp3, 2);
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    temp0 = RET_1_IF_NZERO_W(temp0);
+    in0_w += temp0;
+    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
+
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    in0_w += RET_1_IF_NEG_W(in0_w);
+    in1_w += RET_1_IF_NEG_W(in1_w);
+    in2_w += RET_1_IF_NEG_W(in2_w);
+    in3_w += RET_1_IF_NEG_W(in3_w);
+    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
+    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
+    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
+    ST_SH2(in0_h, in1_h, output, 8);
+}
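
Note: dct_msa.c adds MSA (MIPS SIMD Architecture) versions of vp8_short_fdct4x4, vp8_short_fdct8x4 and vp8_short_walsh4x4. For reference, a scalar sketch of the 4x4 forward DCT that uses the same constants as the coeff vector above (2217, 5352, 14500, 7500, 12000, and 25000 + 26000 = 51000); it is paraphrased from the generic vp8_short_fdct4x4_c and is illustrative only, the canonical code lives in vp8/encoder/dct.c:

    void short_fdct4x4_ref(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input, *op = output;

        for (i = 0; i < 4; i++)            /* row pass, input pre-scaled by 8 */
        {
            int a1 = (ip[0] + ip[3]) * 8, b1 = (ip[1] + ip[2]) * 8;
            int c1 = (ip[1] - ip[2]) * 8, d1 = (ip[0] - ip[3]) * 8;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

            ip += pitch / 2;               /* pitch is in bytes, ip is short* */
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++)            /* column pass with rounding */
        {
            int a1 = ip[0] + ip[12], b1 = ip[4] + ip[8];
            int c1 = ip[4] - ip[8],  d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }
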
diff --git a/libvpx/vp8/encoder/mips/msa/denoising_msa.c b/libvpx/vp8/encoder/mips/msa/denoising_msa.c
new file mode 100644
index 0000000..66965c6
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/msa/denoising_msa.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/denoising.h"
+
+int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr,
+                                int32_t mc_avg_y_stride,
+                                uint8_t *running_avg_y_ptr,
+                                int32_t avg_y_stride,
+                                uint8_t *sig_ptr, int32_t sig_stride,
+                                uint32_t motion_magnitude,
+                                int32_t increase_denoising)
+{
+    uint8_t *running_avg_y_start = running_avg_y_ptr;
+    uint8_t *sig_start = sig_ptr;
+    int32_t cnt = 0;
+    int32_t sum_diff = 0;
+    int32_t shift_inc1 = 3;
+    int32_t delta = 0;
+    int32_t sum_diff_thresh;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 mc_running_avg_y0, running_avg_y, sig0;
+    v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+    v16u8 coeff0, coeff1;
+    v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1;
+    v8i16 adjust0, adjust1, adjust2, adjust3;
+    v8i16 shift_inc1_vec = { 0 };
+    v8i16 col_sum0 = { 0 };
+    v8i16 col_sum1 = { 0 };
+    v8i16 col_sum2 = { 0 };
+    v8i16 col_sum3 = { 0 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec;
+    v4i32 temp0_w;
+    v2i64 temp0_d, temp1_d;
+    v8i16 zero = { 0 };
+    v8i16 one = __msa_ldi_h(1);
+    v8i16 four = __msa_ldi_h(4);
+    v8i16 val_127 = __msa_ldi_h(127);
+    v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+    {
+        adj_val = __msa_add_a_h(adj_val, one);
+        if (increase_denoising)
+        {
+            adj_val = __msa_add_a_h(adj_val, one);
+            shift_inc1 = 4;
+        }
+
+        temp0_h = zero - adj_val;
+        adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+    }
+
+    adj_val = __msa_insert_h(adj_val, 3, cnt);
+    adj_val = __msa_insert_h(adj_val, 7, cnt);
+    shift_inc1_vec = __msa_fill_h(shift_inc1);
+
+    for (cnt = 8; cnt--;)
+    {
+        v8i16 mask0 = { 0 };
+        v8i16 mask1 = { 0 };
+
+        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+        sig0 = LD_UB(sig_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+
+        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+        sig1 = LD_UB(sig_ptr);
+
+        ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+        abs_diff0 = __msa_add_a_h(diff0, zero);
+        abs_diff1 = __msa_add_a_h(diff1, zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 15);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 7);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = abs_diff1 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask1 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        temp1_h = __msa_clei_s_h(diff1, 0);
+        temp1_h = temp1_h & four;
+        mask1 += temp1_h;
+        VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+                   adjust1);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        temp3_h = __msa_ceqi_h(adjust1, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                     (v16u8)temp2_h);
+        adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1,
+                                     (v16u8)temp3_h);
+        ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+        UNPCK_UB_SH(sig0, temp0_h, temp1_h);
+        ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+        MAXI_SH2_SH(temp0_h, temp1_h, 0);
+        SAT_UH2_SH(temp0_h, temp1_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0,
+                                     (v16u8)temp2_h);
+        ST_UB(running_avg_y, running_avg_y_ptr);
+        running_avg_y_ptr += avg_y_stride;
+
+        mask0 = zero;
+        mask1 = zero;
+        ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+        abs_diff0 = __msa_add_a_h(diff0, zero);
+        abs_diff1 = __msa_add_a_h(diff1, zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 15);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 7);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = abs_diff1 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask1 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        temp1_h = __msa_clei_s_h(diff1, 0);
+        temp1_h = temp1_h & four;
+        mask1 += temp1_h;
+        VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+                   adjust1);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        temp3_h = __msa_ceqi_h(adjust1, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1,
+                                      (v16u8)temp3_h);
+        ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+        UNPCK_UB_SH(sig1, temp0_h, temp1_h);
+        ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+        MAXI_SH2_SH(temp0_h, temp1_h, 0);
+        SAT_UH2_SH(temp0_h, temp1_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1,
+                                     (v16u8)temp2_h);
+        ST_UB(running_avg_y, running_avg_y_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        running_avg_y_ptr += avg_y_stride;
+    }
+
+    col_sum0 = __msa_min_s_h(col_sum0, val_127);
+    col_sum1 = __msa_min_s_h(col_sum1, val_127);
+    temp0_h = col_sum0 + col_sum1;
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 16;
+    mc_running_avg_y_ptr -= mc_avg_y_stride * 16;
+    running_avg_y_ptr -= avg_y_stride * 16;
+
+    sum_diff_thresh = SUM_DIFF_THRESHOLD;
+
+    if (increase_denoising)
+    {
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+    }
+
+    if (abs(sum_diff) > sum_diff_thresh)
+    {
+        delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+        delta_vec = __msa_fill_h(delta);
+        if (delta < 4)
+        {
+            for (cnt = 8; cnt--;)
+            {
+                running_avg_y = LD_UB(running_avg_y_ptr);
+                mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+                sig0 = LD_UB(sig_ptr);
+                sig_ptr += sig_stride;
+                mc_running_avg_y_ptr += mc_avg_y_stride;
+                running_avg_y_ptr += avg_y_stride;
+                mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+                sig1 = LD_UB(sig_ptr);
+                running_avg_y1 = LD_UB(running_avg_y_ptr);
+                ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+                HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+                abs_diff0 = __msa_add_a_h(diff0, zero);
+                abs_diff1 = __msa_add_a_h(diff1, zero);
+                temp0_h = abs_diff0 < delta_vec;
+                temp1_h = abs_diff1 < delta_vec;
+                abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp0_h);
+                abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp1_h);
+                SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0,
+                     abs_diff_neg1);
+                abs_diff_neg0 = zero - abs_diff0;
+                abs_diff_neg1 = zero - abs_diff1;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                temp1_h = __msa_clei_s_h(diff1, 0);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                              (v16u8)abs_diff_neg0,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1,
+                                              (v16u8)abs_diff_neg1,
+                                              (v16u8)temp1_h);
+                ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h);
+                ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+                MAXI_SH2_SH(adjust2, adjust3, 0);
+                SAT_UH2_SH(adjust2, adjust3, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                temp1_h = __msa_ceqi_h(diff1, 0);
+                adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                             (v16u8)temp0_h);
+                adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h,
+                                             (v16u8)temp1_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero,
+                                              (v16u8)temp1_h);
+                ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3,
+                                                     (v16i8)adjust2);
+                ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride);
+                ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+                HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+                abs_diff0 = __msa_add_a_h(diff0, zero);
+                abs_diff1 = __msa_add_a_h(diff1, zero);
+                temp0_h = abs_diff0 < delta_vec;
+                temp1_h = abs_diff1 < delta_vec;
+                abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp0_h);
+                abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp1_h);
+                SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0,
+                     abs_diff_neg1);
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                temp1_h = __msa_clei_s_h(diff1, 0);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                              (v16u8)abs_diff_neg0,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1,
+                                              (v16u8)abs_diff_neg1,
+                                              (v16u8)temp1_h);
+                ILVRL_H2_SH(zero, running_avg_y1, temp2_h, temp3_h);
+                ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+                MAXI_SH2_SH(adjust2, adjust3, 0);
+                SAT_UH2_SH(adjust2, adjust3, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                temp1_h = __msa_ceqi_h(diff1, 0);
+                adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                             (v16u8)temp0_h);
+                adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h,
+                                             (v16u8)temp1_h);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)adjust0, (v16u8)zero,
+                                             (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmz_v((v16u8)adjust1, (v16u8)zero,
+                                             (v16u8)temp1_h);
+                ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3,
+                                                     (v16i8)adjust2);
+                ST_UB(running_avg_y, running_avg_y_ptr);
+                running_avg_y_ptr += avg_y_stride;
+            }
+
+            col_sum2 = __msa_min_s_h(col_sum2, val_127);
+            col_sum3 = __msa_min_s_h(col_sum3, val_127);
+            temp0_h = col_sum2 + col_sum3;
+            temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+            temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+            temp1_d = __msa_splati_d(temp0_d, 1);
+            temp0_d += (v2i64)temp1_d;
+            sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+            if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+            {
+                return COPY_BLOCK;
+            }
+        }
+        else
+        {
+            return COPY_BLOCK;
+        }
+    }
+
+    LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6,
+           src7);
+    sig_start += (8 * sig_stride);
+    LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13,
+           src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start,
+           avg_y_stride);
+    running_avg_y_start += (8 * avg_y_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           running_avg_y_start, avg_y_stride);
+
+    return FILTER_BLOCK;
+}
+
+int32_t vp8_denoiser_filter_uv_msa(uint8_t *mc_running_avg_y_ptr,
+                                   int32_t mc_avg_y_stride,
+                                   uint8_t *running_avg_y_ptr,
+                                   int32_t avg_y_stride,
+                                   uint8_t *sig_ptr,
+                                   int32_t sig_stride,
+                                   uint32_t motion_magnitude,
+                                   int32_t increase_denoising)
+{
+    uint8_t *running_avg_y_start = running_avg_y_ptr;
+    uint8_t *sig_start = sig_ptr;
+    int32_t cnt = 0;
+    int32_t sum_diff = 0;
+    int32_t shift_inc1 = 3;
+    int32_t delta = 0;
+    int32_t sum_block = 0;
+    int32_t sum_diff_thresh;
+    int64_t dst0, dst1, src0, src1, src2, src3;
+    v16u8 mc_running_avg_y0, running_avg_y, sig0;
+    v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+    v16u8 sig2, sig3, sig4, sig5, sig6, sig7;
+    v16u8 coeff0;
+    v8i16 diff0, abs_diff0, abs_diff_neg0;
+    v8i16 adjust0, adjust2;
+    v8i16 shift_inc1_vec = { 0 };
+    v8i16 col_sum0 = { 0 };
+    v8i16 temp0_h, temp2_h, cmp, delta_vec;
+    v4i32 temp0_w;
+    v2i64 temp0_d, temp1_d;
+    v16i8 zero = { 0 };
+    v8i16 one = __msa_ldi_h(1);
+    v8i16 four = __msa_ldi_h(4);
+    v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+
+    sig0 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+    sig1 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+    sig2 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2);
+    sig3 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3);
+    sig4 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4);
+    sig5 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5);
+    sig6 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6);
+    sig7 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7);
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_block = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 8;
+
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV)
+    {
+        return COPY_BLOCK;
+    }
+
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+    {
+        adj_val = __msa_add_a_h(adj_val, one);
+
+        if (increase_denoising)
+        {
+            adj_val = __msa_add_a_h(adj_val, one);
+            shift_inc1 = 4;
+        }
+
+        temp0_h = (v8i16)zero - adj_val;
+        adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+    }
+
+    adj_val = __msa_insert_h(adj_val, 3, cnt);
+    adj_val = __msa_insert_h(adj_val, 7, cnt);
+    shift_inc1_vec = __msa_fill_h(shift_inc1);
+    for (cnt = 4; cnt--;)
+    {
+        v8i16 mask0 = { 0 };
+        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+        sig0 = LD_UB(sig_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+        sig1 = LD_UB(sig_ptr);
+        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
+        diff0 = __msa_hsub_u_h(coeff0, coeff0);
+        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        col_sum0 += adjust0;
+        temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+        temp0_h += adjust0;
+        temp0_h = __msa_maxi_s_h(temp0_h, 0);
+        temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0,
+                                     (v16u8)temp2_h);
+        dst0 = __msa_copy_s_d((v2i64)running_avg_y,  0);
+        SD(dst0, running_avg_y_ptr);
+        running_avg_y_ptr += avg_y_stride;
+
+        mask0 = __msa_ldi_h(0);
+        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
+        diff0 = __msa_hsub_u_h(coeff0, coeff0);
+        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        col_sum0 += adjust0;
+        temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+        temp0_h += adjust0;
+        temp0_h = __msa_maxi_s_h(temp0_h, 0);
+        temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1,
+                                     (v16u8)temp2_h);
+        dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+        SD(dst1, running_avg_y_ptr);
+
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        running_avg_y_ptr += avg_y_stride;
+    }
+
+    temp0_h = col_sum0;
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 8;
+    mc_running_avg_y_ptr -= mc_avg_y_stride * 8;
+    running_avg_y_ptr -= avg_y_stride * 8;
+    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+
+    if (increase_denoising)
+    {
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    }
+
+    if (abs(sum_diff) > sum_diff_thresh)
+    {
+        delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+        delta_vec = __msa_fill_h(delta);
+        if (delta < 4)
+        {
+            for (cnt = 4; cnt--;)
+            {
+                running_avg_y = LD_UB(running_avg_y_ptr);
+                mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+                sig0 = LD_UB(sig_ptr);
+                /* Update pointers for next iteration. */
+                sig_ptr += sig_stride;
+                mc_running_avg_y_ptr += mc_avg_y_stride;
+                running_avg_y_ptr += avg_y_stride;
+
+                mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+                sig1 = LD_UB(sig_ptr);
+                running_avg_y1 = LD_UB(running_avg_y_ptr);
+
+                coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0,
+                                             (v16i8)sig0);
+                diff0 = __msa_hsub_u_h(coeff0, coeff0);
+                abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+                temp0_h = delta_vec < abs_diff0;
+                abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                                (v16u8)delta_vec,
+                                                (v16u8)temp0_h);
+                abs_diff_neg0 = (v8i16)zero - abs_diff0;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                             (v16u8)abs_diff_neg0,
+                                             (v16u8)temp0_h);
+                temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y);
+                adjust2 = temp2_h + adjust0;
+                adjust2 = __msa_maxi_s_h(adjust2, 0);
+                adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                              (v16u8)temp0_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                col_sum0 += adjust0;
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2,
+                                                     (v16i8)adjust2);
+                dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+                SD(dst0, running_avg_y_ptr - avg_y_stride);
+
+                coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1,
+                                             (v16i8)sig1);
+                diff0 = __msa_hsub_u_h(coeff0, coeff0);
+                abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+                temp0_h = delta_vec < abs_diff0;
+                abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                                (v16u8)delta_vec,
+                                                (v16u8)temp0_h);
+                abs_diff_neg0 = (v8i16)zero - abs_diff0;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                             (v16u8)abs_diff_neg0,
+                                             (v16u8)temp0_h);
+                temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1);
+                adjust2 = temp2_h + adjust0;
+                adjust2 = __msa_maxi_s_h(adjust2, 0);
+                adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                              (v16u8)temp0_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                col_sum0 += adjust0;
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2,
+                                                     (v16i8)adjust2);
+                dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+                SD(dst1, running_avg_y_ptr);
+                running_avg_y_ptr += avg_y_stride;
+            }
+
+            temp0_h = col_sum0;
+            temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+            temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+            temp1_d = __msa_splati_d(temp0_d, 1);
+            temp0_d += temp1_d;
+            sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+
+            if (abs(sum_diff) > sum_diff_thresh)
+            {
+                return COPY_BLOCK;
+            }
+        }
+        else
+        {
+            return COPY_BLOCK;
+        }
+    }
+
+    LD4(sig_start, sig_stride, src0, src1, src2, src3);
+    sig_start += (4 * sig_stride);
+    SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+    running_avg_y_start += (4 * avg_y_stride);
+
+    LD4(sig_start, sig_stride, src0, src1, src2, src3);
+    SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+
+    return FILTER_BLOCK;
+}
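
Note: both denoiser entry points above apply the same per-pixel rule as the generic vp8_denoiser_filter_c, vectorised 16 luma or 8 chroma pixels at a time, then fall back to COPY_BLOCK when the accumulated adjustment exceeds the sum-diff threshold (with one delta-capped retry pass). A scalar paraphrase of the first-pass rule, reconstructed from the vector constants above (adj_val steps 3/4/6 before the low-motion +1/+2 bump, thresholds 7 and 15, shift_inc1 of 3 or 4), not quoted from denoising.c:

    static unsigned char denoise_pixel(unsigned char sig, unsigned char mc_avg,
                                       int shift_inc1)  /* 3, or 4 for stronger denoising */
    {
        int diff = mc_avg - sig;    /* motion-compensated average minus source */
        int adj, out;

        if (abs(diff) < shift_inc1)
            adj = diff;                          /* small delta: move fully to mc_avg */
        else if (abs(diff) <= 7)
            adj = (diff > 0) ? 3 : -3;
        else if (abs(diff) <= 15)
            adj = (diff > 0) ? 4 : -4;
        else
            adj = (diff > 0) ? 6 : -6;

        out = sig + adj;                         /* adj also feeds the column sums that */
        if (out < 0)   out = 0;                  /* decide FILTER_BLOCK vs COPY_BLOCK   */
        if (out > 255) out = 255;
        return (unsigned char)out;
    }
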
diff --git a/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c b/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
new file mode 100644
index 0000000..ea794a8
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
@@ -0,0 +1,174 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr)
+{
+    int32_t err = 0;
+    uint32_t loop_cnt;
+    v8i16 coeff, dq_coeff, coeff0, coeff1;
+    v4i32 diff0, diff1;
+    v2i64 err0 = { 0 };
+    v2i64 err1 = { 0 };
+
+    for (loop_cnt = 2; loop_cnt--;)
+    {
+        coeff = LD_SH(coeff_ptr);
+        dq_coeff = LD_SH(dq_coeff_ptr);
+        ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DPADD_SD2_SD(diff0, diff1, err0, err1);
+        coeff_ptr += 8;
+        dq_coeff_ptr += 8;
+    }
+
+    err0 += __msa_splati_d(err0, 1);
+    err1 += __msa_splati_d(err1, 1);
+    err = __msa_copy_s_d(err0, 0);
+    err += __msa_copy_s_d(err1, 0);
+
+    return err;
+}
+
+int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc)
+{
+    BLOCK *be;
+    BLOCKD *bd;
+    int16_t *coeff_ptr, *dq_coeff_ptr;
+    int32_t err = 0;
+    uint32_t loop_cnt;
+    v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4;
+    v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4;
+    v4i32 diff0, diff1;
+    v2i64 err0, err1;
+    v16u8 zero  = { 0 };
+    v16u8 mask0 = (v16u8)__msa_ldi_b(255);
+
+    if (1 == dc)
+    {
+        mask0 = (v16u8)__msa_insve_w((v4i32)mask0, 0, (v4i32)zero);
+    }
+
+    for (loop_cnt = 0; loop_cnt < 8; loop_cnt++)
+    {
+        be = &mb->block[2 * loop_cnt];
+        bd = &mb->e_mbd.block[2 * loop_cnt];
+        coeff_ptr = be->coeff;
+        dq_coeff_ptr = bd->dqcoeff;
+        coeff = LD_SH(coeff_ptr);
+        dq_coeff = LD_SH(dq_coeff_ptr);
+        coeff_ptr += 8;
+        dq_coeff_ptr += 8;
+        coeff2 = LD_SH(coeff_ptr);
+        dq_coeff2 = LD_SH(dq_coeff_ptr);
+        be = &mb->block[2 * loop_cnt + 1];
+        bd = &mb->e_mbd.block[2 * loop_cnt + 1];
+        coeff_ptr = be->coeff;
+        dq_coeff_ptr = bd->dqcoeff;
+        coeff3 = LD_SH(coeff_ptr);
+        dq_coeff3 = LD_SH(dq_coeff_ptr);
+        coeff_ptr += 8;
+        dq_coeff_ptr += 8;
+        coeff4 = LD_SH(coeff_ptr);
+        dq_coeff4 = LD_SH(dq_coeff_ptr);
+        ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0);
+        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+        ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DPADD_SD2_SD(diff0, diff1, err0, err1);
+        err0 += __msa_splati_d(err0, 1);
+        err1 += __msa_splati_d(err1, 1);
+        err += __msa_copy_s_d(err0, 0);
+        err += __msa_copy_s_d(err1, 0);
+
+        ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0);
+        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+        ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DPADD_SD2_SD(diff0, diff1, err0, err1);
+        err0 += __msa_splati_d(err0, 1);
+        err1 += __msa_splati_d(err1, 1);
+        err += __msa_copy_s_d(err0, 0);
+        err += __msa_copy_s_d(err1, 0);
+    }
+
+    return err;
+}
+
+int32_t vp8_mbuverror_msa(MACROBLOCK *mb)
+{
+    BLOCK *be;
+    BLOCKD *bd;
+    int16_t *coeff_ptr, *dq_coeff_ptr;
+    int32_t err = 0;
+    uint32_t loop_cnt;
+    v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4;
+    v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4;
+    v4i32 diff0, diff1;
+    v2i64 err0, err1, err_dup0, err_dup1;
+
+    for (loop_cnt = 16; loop_cnt < 24; loop_cnt += 2)
+    {
+        be = &mb->block[loop_cnt];
+        bd = &mb->e_mbd.block[loop_cnt];
+        coeff_ptr = be->coeff;
+        dq_coeff_ptr = bd->dqcoeff;
+        coeff = LD_SH(coeff_ptr);
+        dq_coeff = LD_SH(dq_coeff_ptr);
+        coeff_ptr += 8;
+        dq_coeff_ptr += 8;
+        coeff2 = LD_SH(coeff_ptr);
+        dq_coeff2 = LD_SH(dq_coeff_ptr);
+        be = &mb->block[loop_cnt + 1];
+        bd = &mb->e_mbd.block[loop_cnt + 1];
+        coeff_ptr = be->coeff;
+        dq_coeff_ptr = bd->dqcoeff;
+        coeff3 = LD_SH(coeff_ptr);
+        dq_coeff3 = LD_SH(dq_coeff_ptr);
+        coeff_ptr += 8;
+        dq_coeff_ptr += 8;
+        coeff4 = LD_SH(coeff_ptr);
+        dq_coeff4 = LD_SH(dq_coeff_ptr);
+
+        ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+
+        ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DPADD_SD2_SD(diff0, diff1, err0, err1);
+        err_dup0 = __msa_splati_d(err0, 1);
+        err_dup1 = __msa_splati_d(err1, 1);
+        ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
+        err += __msa_copy_s_d(err0, 0);
+        err += __msa_copy_s_d(err1, 0);
+
+        ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+        ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1);
+        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+        DPADD_SD2_SD(diff0, diff1, err0, err1);
+        err_dup0 = __msa_splati_d(err0, 1);
+        err_dup1 = __msa_splati_d(err1, 1);
+        ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
+        err += __msa_copy_s_d(err0, 0);
+        err += __msa_copy_s_d(err1, 0);
+    }
+
+    return err;
+}
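
Note: all three helpers above compute a sum of squared differences between original and dequantized coefficients: vp8_block_error_msa for one 4x4 block, vp8_mbblock_error_msa over the 16 luma blocks (with the DC coefficient masked out when dc == 1, which is what the mask0/insve_w setup implements), and vp8_mbuverror_msa over the 8 chroma blocks (indices 16..23). A scalar reference for the per-block case, paraphrased from the generic vp8_block_error_c:

    static int block_error_ref(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;

        for (i = 0; i < 16; i++)
        {
            int d = coeff[i] - dqcoeff[i];
            error += d * d;
        }
        return error;
    }
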
diff --git a/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/libvpx/vp8/encoder/mips/msa/quantize_msa.c
new file mode 100644
index 0000000..0f97646
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/msa/quantize_msa.c
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
+static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin,
+                                  int16_t *round, int16_t *quant,
+                                  int16_t *de_quant, int16_t *q_coeff,
+                                  int16_t *dq_coeff)
+{
+    int32_t cnt, eob;
+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
+                          3, 8, 11, 13, 9, 10, 14, 15 };
+    v8i16 round0, round1;
+    v8i16 sign_z0, sign_z1;
+    v8i16 q_coeff0, q_coeff1;
+    v8i16 x0, x1, de_quant0, de_quant1;
+    v8i16 coeff0, coeff1, z0, z1;
+    v8i16 quant0, quant1, quant2, quant3;
+    v8i16 zero = { 0 };
+    v8i16 inv_zig_zag0, inv_zig_zag1;
+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+
+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+    eob = -1;
+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z0, z1);
+    LD_SH2(round, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               round0, round1);
+    LD_SH2(quant, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    sign_z0 = z0 >> 15;
+    sign_z1 = z1 >> 15;
+    x0 = __msa_add_a_h(z0, zero);
+    x1 = __msa_add_a_h(z1, zero);
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+    x0 = x0 ^ sign_z0;
+    x1 = x1 ^ sign_z1;
+    SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
+    VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+    LD_SH2(de_quant, 8, de_quant0, de_quant1);
+    q_coeff0 *= de_quant0;
+    q_coeff1 *= de_quant1;
+    ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);
+
+    for (cnt = 0; cnt < 16; ++cnt)
+    {
+        if ((cnt <= 7) && (x1[7 - cnt] != 0))
+        {
+            eob = (15 - cnt);
+            break;
+        }
+
+        if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0))
+        {
+            eob = (7 - (cnt - 8));
+            break;
+        }
+    }
+
+    return (int8_t)(eob + 1);
+}
+
+static int8_t exact_regular_quantize_b_msa(int16_t *zbin_boost,
+                                           int16_t *coeff_ptr,
+                                           int16_t *zbin,
+                                           int16_t *round,
+                                           int16_t *quant,
+                                           int16_t *quant_shift,
+                                           int16_t *de_quant,
+                                           int16_t zbin_oq_in,
+                                           int16_t *q_coeff,
+                                           int16_t *dq_coeff)
+{
+    int32_t cnt, eob;
+    int16_t *boost_temp = zbin_boost;
+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
+                          3, 8, 11, 13, 9, 10, 14, 15 };
+    v8i16 round0, round1;
+    v8i16 sign_z0, sign_z1;
+    v8i16 q_coeff0, q_coeff1;
+    v8i16 z_bin0, z_bin1, zbin_o_q;
+    v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
+    v8i16 coeff0, coeff1, z0, z1;
+    v8i16 quant0, quant1, quant2, quant3;
+    v8i16 zero = { 0 };
+    v8i16 inv_zig_zag0, inv_zig_zag1;
+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+
+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+    zbin_o_q = __msa_fill_h(zbin_oq_in);
+    eob = -1;
+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z0, z1);
+    LD_SH2(round, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               round0, round1);
+    LD_SH2(quant, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    LD_SH2(zbin, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z_bin0, z_bin1);
+    sign_z0 = z0 >> 15;
+    sign_z1 = z1 >> 15;
+    x0 = __msa_add_a_h(z0, zero);
+    x1 = __msa_add_a_h(z1, zero);
+    SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+    SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
+    LD_SH2(quant_shift, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ADD2(x0, round0, x1, round1, x0, x1);
+    ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+    sign_x0 = x0 ^ sign_z0;
+    sign_x1 = x1 ^ sign_z1;
+    SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+    for (cnt = 0; cnt < 16; ++cnt)
+    {
+        if (cnt <= 7)
+        {
+            if (boost_temp[0] <= z_bin0[cnt])
+            {
+                if (x0[cnt])
+                {
+                    eob = cnt;
+                    boost_temp = zbin_boost;
+                }
+                else
+                {
+                    boost_temp++;
+                }
+            }
+            else
+            {
+                sign_x0[cnt] = 0;
+                boost_temp++;
+            }
+        }
+        else
+        {
+            if (boost_temp[0] <= z_bin1[cnt - 8])
+            {
+                if (x1[cnt - 8])
+                {
+                    eob = cnt;
+                    boost_temp = zbin_boost;
+                }
+                else
+                {
+                    boost_temp++;
+                }
+            }
+            else
+            {
+                sign_x1[cnt - 8] = 0;
+                boost_temp++;
+            }
+        }
+    }
+
+    VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
+               q_coeff0, q_coeff1);
+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+    LD_SH2(de_quant, 8, de_quant0, de_quant1);
+    MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
+    ST_SH2(de_quant0, de_quant1, dq_coeff, 8);
+
+    return (int8_t)(eob + 1);
+}
+
+void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d)
+{
+    int16_t *coeff_ptr = b->coeff;
+    int16_t *zbin_ptr = b->zbin;
+    int16_t *round_ptr = b->round;
+    int16_t *quant_ptr = b->quant_fast;
+    int16_t *qcoeff_ptr = d->qcoeff;
+    int16_t *dqcoeff_ptr = d->dqcoeff;
+    int16_t *dequant_ptr = d->dequant;
+
+    *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+                                  dequant_ptr, qcoeff_ptr, dqcoeff_ptr);
+}
+
+void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d)
+{
+    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+    int16_t *coeff_ptr = b->coeff;
+    int16_t *zbin_ptr = b->zbin;
+    int16_t *round_ptr = b->round;
+    int16_t *quant_ptr = b->quant;
+    int16_t *quant_shift_ptr = b->quant_shift;
+    int16_t *qcoeff_ptr = d->qcoeff;
+    int16_t *dqcoeff_ptr = d->dqcoeff;
+    int16_t *dequant_ptr = d->dequant;
+    int16_t zbin_oq_value = b->zbin_extra;
+
+    *d->eob = exact_regular_quantize_b_msa(zbin_boost_ptr, coeff_ptr,
+                                           zbin_ptr, round_ptr,
+                                           quant_ptr, quant_shift_ptr,
+                                           dequant_ptr, zbin_oq_value,
+                                           qcoeff_ptr, dqcoeff_ptr);
+}
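
Note: the fast path above quantizes every coefficient unconditionally, while the regular path also applies the zbin/zbin_boost dead zone that depends on the run of preceding zeros in zig-zag order (the boost_temp walk in the scalar tail of exact_regular_quantize_b_msa). A scalar sketch of the fast quantizer, paraphrased from the generic vp8_fast_quantize_b_c; the zig-zag table is taken as a parameter here rather than quoted:

    static signed char fast_quantize_b_ref(const short *coeff, const short *round,
                                           const short *quant, const short *dequant,
                                           short *qcoeff, short *dqcoeff,
                                           const int *zigzag)  /* 16-entry scan order */
    {
        int i, eob = -1;

        for (i = 0; i < 16; i++)
        {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z < 0 ? -1 : 0;             /* sign mask */
            int x  = (z ^ sz) - sz;              /* abs(z) */
            int y  = ((x + round[rc]) * quant[rc]) >> 16;

            x = (y ^ sz) - sz;                   /* restore the sign */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);
            if (y)
                eob = i;                         /* last non-zero in scan order */
        }
        return (signed char)(eob + 1);
    }
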
diff --git a/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c b/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000..5cca5e0
--- /dev/null
+++ b/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,303 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frame2_ptr,
+                                             int32_t strength_in,
+                                             int32_t filter_wt_in,
+                                             uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
+    v16u8 frame_l, frame_h;
+    v16i8 zero = { 0 };
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16, filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+
+    for (row = 8; row--;)
+    {
+        frame1_0_b = LD_SB(frame1_ptr);
+        frame2_0_b = LD_SB(frame2_ptr);
+        frame1_ptr += stride;
+        frame2_ptr += 16;
+        frame1_1_b = LD_SB(frame1_ptr);
+        frame2_1_b = LD_SB(frame2_ptr);
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        frame1_ptr += stride;
+        frame2_ptr += 16;
+    }
+}
+
+static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frame2_ptr,
+                                            int32_t strength_in,
+                                            int32_t filter_wt_in,
+                                            uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
+    v16i8 frame1 = { 0 };
+    v16i8 frame2 = { 0 };
+    v16i8 frame3 = { 0 };
+    v16i8 frame4 = { 0 };
+    v16u8 frame_l, frame_h;
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16;
+    v4i32 filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+
+    for (row = 2; row--;)
+    {
+        LD2(frame1_ptr, stride, f0, f1);
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f2, f3);
+        frame2_ptr += 16;
+        LD2(frame1_ptr, stride, f4, f5);
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f6, f7);
+        frame2_ptr += 16;
+
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        INSERT_D2_SB(f0, f1, frame1);
+        INSERT_D2_SB(f2, f3, frame2);
+        INSERT_D2_SB(f4, f5, frame3);
+        INSERT_D2_SB(f6, f7, frame4);
+        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+
+        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+    }
+}
+
+void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
+                                   uint8_t *frame2, uint32_t block_size,
+                                   int32_t strength,  int32_t filter_weight,
+                                   uint32_t *accumulator, uint16_t *count)
+{
+    if (8 == block_size)
+    {
+        temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
+                                        filter_weight, accumulator, count);
+    }
+    else if (16 == block_size)
+    {
+        temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
+                                         filter_weight, accumulator, count);
+    }
+    else
+    {
+        uint32_t i, j, k;
+        int32_t modifier;
+        int32_t byte = 0;
+        const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+        for (i = 0, k = 0; i < block_size; ++i)
+        {
+            for (j = 0; j < block_size; ++j, ++k)
+            {
+                int src_byte = frame1[byte];
+                int pixel_value = *frame2++;
+
+                modifier = src_byte - pixel_value;
+                modifier *= modifier;
+                modifier *= 3;
+                modifier += rounding;
+                modifier >>= strength;
+
+                if (modifier > 16)
+                    modifier = 16;
+
+                modifier = 16 - modifier;
+                modifier *= filter_weight;
+
+                count[k] += modifier;
+                accumulator[k] += modifier * pixel_value;
+
+                byte++;
+            }
+
+            byte += stride - block_size;
+        }
+    }
+}
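
Editor's note (illustrative sketch, not part of this patch): vp8_temporal_filter_apply_msa only accumulates weighted pixel sums in |accumulator| and the corresponding weights in |count|; producing the filtered pixel is assumed to happen later in the encoder by dividing the two (the encoder itself may use a fixed-point reciprocal rather than plain division). A minimal sketch of that normalization step, with hypothetical names, is shown below.

#include <stdint.h>

static void sketch_temporal_filter_normalize(const uint32_t *accumulator,
                                             const uint16_t *count,
                                             uint8_t *dst, int num_pixels)
{
    int k;
    for (k = 0; k < num_pixels; ++k)
    {
        if (count[k])
        {
            /* Rounded weighted average of the co-located pixels that were
             * accumulated by the apply step above. */
            uint32_t v = (accumulator[k] + (count[k] >> 1)) / count[k];
            dst[k] = (uint8_t)(v > 255 ? 255 : v);
        }
    }
}
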
diff --git a/libvpx/vp8/encoder/modecosts.c b/libvpx/vp8/encoder/modecosts.c
index c61563c..ad0e930 100644
--- a/libvpx/vp8/encoder/modecosts.c
+++ b/libvpx/vp8/encoder/modecosts.c
@@ -10,6 +10,7 @@
 
 
 #include "vp8/common/blockd.h"
+#include "modecosts.h"
 #include "onyx_int.h"
 #include "treewriter.h"
 #include "vp8/common/entropymode.h"
diff --git a/libvpx/vp8/encoder/modecosts.h b/libvpx/vp8/encoder/modecosts.h
index 9281551..9871bff 100644
--- a/libvpx/vp8/encoder/modecosts.h
+++ b/libvpx/vp8/encoder/modecosts.h
@@ -16,7 +16,9 @@
 extern "C" {
 #endif
 
-void vp8_init_mode_costs(VP8_COMP *x);
+struct VP8_COMP;
+
+void vp8_init_mode_costs(struct VP8_COMP *x);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index 7140f2f..5e05c8c 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -11,11 +11,13 @@
 
 #include "vpx_config.h"
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/blockd.h"
 #include "onyx_int.h"
 #include "vp8/common/systemdependent.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
@@ -72,26 +74,7 @@
 
 #if CONFIG_INTERNAL_STATS
 #include "math.h"
-
-extern double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-);
-
-
-extern double vp8_calc_ssimg
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    double *ssim_y,
-    double *ssim_u,
-    double *ssim_v
-);
-
-
+#include "vpx_dsp/ssim.h"
 #endif
 
 
@@ -427,10 +410,10 @@
 
     cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
     cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
-    vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
-    vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
-    vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
-    vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+    memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+    memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
 
     set_default_lf_deltas(cpi);
 
@@ -507,7 +490,7 @@
 static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
 {
     /* Copy in the new segmentation map */
-    vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+    memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
 
     /* Signal that the map should be updated. */
     cpi->mb.e_mbd.update_mb_segmentation_map = 1;
@@ -529,7 +512,7 @@
 static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
 {
     cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
-    vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+    memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
 }
 
 
@@ -579,11 +562,32 @@
 
     cpi->cyclic_refresh_q = Q / 2;
 
+    if (cpi->oxcf.screen_content_mode) {
+      // Modify quality ramp-up based on Q. Above some Q level, increase the
+      // number of blocks to be refreshed; below that threshold, reduce it.
+      // Turn it off under certain conditions (i.e., away from a key frame, and
+      // if we are at good quality (low Q) and most of the blocks were
+      // skip-encoded in the previous frame).
+      int qp_thresh = (cpi->oxcf.screen_content_mode == 2) ? 80 : 100;
+      if (Q >= qp_thresh) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+      } else if (cpi->frames_since_key > 250 &&
+                 Q < 20 &&
+                 cpi->mb.skip_true_count > (int)(0.95 * mbs_in_frame)) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe = 0;
+      } else {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+      }
+      block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+    }
+
     // Set every macroblock to be eligible for update.
     // For key frame this will reset seg map to 0.
-    vpx_memset(cpi->segmentation_map, 0, mbs_in_frame);
+    memset(cpi->segmentation_map, 0, mbs_in_frame);
 
-    if (cpi->common.frame_type != KEY_FRAME)
+    if (cpi->common.frame_type != KEY_FRAME && block_count > 0)
     {
         /* Cycle through the macro_block rows */
         /* MB loop to set local segmentation map */
@@ -615,19 +619,24 @@
         cpi->cyclic_refresh_mode_index = i;
 
 #if CONFIG_TEMPORAL_DENOISING
-        if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
-            Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
-          // Under aggressive denoising mode, use segmentation to turn off loop
-          // filter below some qp thresh. The loop filter is turned off for all
-          // blocks that have been encoded as ZEROMV LAST x frames in a row,
-          // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
-          // This is to avoid "dot" artifacts that can occur from repeated
-          // loop filtering on noisy input source.
-          cpi->cyclic_refresh_q = Q;
-          lf_adjustment = -MAX_LOOP_FILTER;
-          for (i = 0; i < mbs_in_frame; ++i) {
-            seg_map[i] = (cpi->consec_zero_last[i] >
-                          cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+        if (cpi->oxcf.noise_sensitivity > 0) {
+          if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+              Q < (int)cpi->denoiser.denoise_pars.qp_thresh &&
+              (cpi->frames_since_key >
+               2 * cpi->denoiser.denoise_pars.consec_zerolast)) {
+            // Under aggressive denoising, use segmentation to turn off loop
+            // filter below some qp thresh. The filter is reduced for all
+            // blocks that have been encoded as ZEROMV LAST x frames in a row,
+            // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+            // This is to avoid "dot" artifacts that can occur from repeated
+            // loop filtering on noisy input source.
+            cpi->cyclic_refresh_q = Q;
+            // lf_adjustment = -MAX_LOOP_FILTER;
+            lf_adjustment = -40;
+            for (i = 0; i < mbs_in_frame; ++i) {
+              seg_map[i] = (cpi->consec_zero_last[i] >
+                            cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+            }
           }
         }
 #endif
@@ -660,8 +669,8 @@
     cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
     cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
 
-    vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
-    vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+    memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
 
     /* Test of ref frame deltas */
     cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
@@ -784,6 +793,7 @@
     }
 
     cpi->mb.mbs_tested_so_far = 0;
+    cpi->mb.mbs_zero_last_dot_suppress = 0;
 
     /* best quality defaults */
     sf->RD = 1;
@@ -851,6 +861,25 @@
     sf->thresh_mult[THR_SPLIT2] =
     sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
 
+    // Special case for temporal layers.
+    // Reduce the thresholds for zero/nearest/near for GOLDEN, if GOLDEN is
+    // used as second reference. We don't modify thresholds for ALTREF case
+    // since ALTREF is usually used as long-term reference in temporal layers.
+    if ((cpi->Speed <= 6) &&
+        (cpi->oxcf.number_of_layers > 1) &&
+        (cpi->ref_frame_flags & VP8_LAST_FRAME) &&
+        (cpi->ref_frame_flags & VP8_GOLD_FRAME)) {
+      if (cpi->closest_reference_frame == GOLDEN_FRAME) {
+        sf->thresh_mult[THR_ZERO2] =  sf->thresh_mult[THR_ZERO2] >> 3;
+        sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 3;
+        sf->thresh_mult[THR_NEAR2]  = sf->thresh_mult[THR_NEAR2] >> 3;
+      } else {
+        sf->thresh_mult[THR_ZERO2] =  sf->thresh_mult[THR_ZERO2] >> 1;
+        sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 1;
+        sf->thresh_mult[THR_NEAR2]  = sf->thresh_mult[THR_NEAR2] >> 1;
+      }
+    }
+
     cpi->mode_check_freq[THR_ZERO1] =
     cpi->mode_check_freq[THR_NEAREST1] =
     cpi->mode_check_freq[THR_NEAR1] =
@@ -1041,7 +1070,7 @@
         if (Speed >= 15)
             sf->half_pixel_search = 0;
 
-        vpx_memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
+        memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
 
     }; /* switch */
 
@@ -1081,12 +1110,10 @@
     if (cpi->sf.improved_quant)
     {
         cpi->mb.quantize_b      = vp8_regular_quantize_b;
-        cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
     }
     else
     {
         cpi->mb.quantize_b      = vp8_fast_quantize_b;
-        cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
     }
     if (cpi->sf.improved_quant != last_improved_quant)
         vp8cx_init_quantizer(cpi);
@@ -1254,7 +1281,7 @@
     CHECK_MEM_ERROR(cpi->active_map,
                     vpx_calloc(cm->mb_rows * cm->mb_cols,
                     sizeof(*cpi->active_map)));
-    vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+    memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
 
 #if CONFIG_MULTITHREAD
     if (width < 640)
@@ -1361,20 +1388,31 @@
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
 
-    /* frame rate is not available on the first frame, as it's derived from
+    /* Frame rate is not available on the first frame, as it's derived from
      * the observed timestamps. The actual value used here doesn't matter
-     * too much, as it will adapt quickly. If the reciprocal of the timebase
-     * seems like a reasonable framerate, then use that as a guess, otherwise
-     * use 30.
+     * too much, as it will adapt quickly.
      */
-    cpi->framerate = (double)(oxcf->timebase.den) /
-                     (double)(oxcf->timebase.num);
+    if (oxcf->timebase.num > 0) {
+      cpi->framerate = (double)(oxcf->timebase.den) /
+                       (double)(oxcf->timebase.num);
+    } else {
+      cpi->framerate = 30;
+    }
 
+    /* If the reciprocal of the timebase seems like a reasonable framerate,
+     * then use that as a guess, otherwise use 30.
+     */
     if (cpi->framerate > 180)
         cpi->framerate = 30;
 
     cpi->ref_framerate = cpi->framerate;
 
+    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->refresh_entropy_probs = 1;
+
     /* change includes all joint functionality */
     vp8_change_config(cpi, oxcf);
 
@@ -1595,12 +1633,6 @@
     cpi->baseline_gf_interval =
         cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
 
-    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
-
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->refresh_entropy_probs = 1;
-
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
     cpi->oxcf.token_partitions = 3;
 #endif
@@ -1703,13 +1735,25 @@
     if (cpi->oxcf.number_of_layers != prev_number_of_layers)
     {
         // If the number of temporal layers are changed we must start at the
-        // base of the pattern cycle, so reset temporal_pattern_counter.
+        // base of the pattern cycle, so set the layer id to 0 and reset
+        // the temporal pattern counter.
+        if (cpi->temporal_layer_id > 0) {
+          cpi->temporal_layer_id = 0;
+        }
         cpi->temporal_pattern_counter = 0;
         reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
     }
 
+    if (!cpi->initial_width)
+    {
+        cpi->initial_width = cpi->oxcf.Width;
+        cpi->initial_height = cpi->oxcf.Height;
+    }
+
     cm->Width       = cpi->oxcf.Width;
     cm->Height      = cpi->oxcf.Height;
+    assert(cm->Width <= cpi->initial_width);
+    assert(cm->Height <= cpi->initial_height);
 
     /* TODO(jkoleszar): if an internal spatial resampling is active,
      * and we downsize the input image, maybe we should clear the
@@ -1830,7 +1874,7 @@
 
     cm = &cpi->common;
 
-    vpx_memset(cpi, 0, sizeof(VP8_COMP));
+    memset(cpi, 0, sizeof(VP8_COMP));
 
     if (setjmp(cm->error.jmp))
     {
@@ -1850,6 +1894,7 @@
     memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
     cpi->common.current_video_frame   = 0;
     cpi->temporal_pattern_counter     = 0;
+    cpi->temporal_layer_id            = -1;
     cpi->kf_overspend_bits            = 0;
     cpi->kf_bitrate_adjustment        = 0;
     cpi->frames_till_gf_update_due      = 0;
@@ -1902,6 +1947,8 @@
     }
 #endif
 
+    cpi->mse_source_denoised = 0;
+
     /* Should we use the cyclic refresh method.
      * Currently this is tied to error resilient mode
      */
@@ -1925,7 +1972,9 @@
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
     CHECK_MEM_ERROR(cpi->consec_zero_last,
-                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+                    vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
+    CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+                    vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
 
 #ifdef VP8_ENTROPY_STATS
     init_context_counters();
@@ -1944,6 +1993,8 @@
     cpi->source_alt_ref_active = 0;
     cpi->common.refresh_alt_ref_frame = 0;
 
+    cpi->force_maxqp = 0;
+
     cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
     cpi->b_calculate_ssimg = 0;
@@ -2060,55 +2111,55 @@
     }
 #endif
 
-    cpi->fn_ptr[BLOCK_16X16].sdf            = vp8_sad16x16;
-    cpi->fn_ptr[BLOCK_16X16].vf             = vp8_variance16x16;
-    cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
-    cpi->fn_ptr[BLOCK_16X16].sdx3f          = vp8_sad16x16x3;
-    cpi->fn_ptr[BLOCK_16X16].sdx8f          = vp8_sad16x16x8;
-    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vp8_sad16x16x4d;
+    cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
+    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].svf            = vpx_sub_pixel_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vpx_variance_halfpixvar16x16_h;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vpx_variance_halfpixvar16x16_v;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
+    cpi->fn_ptr[BLOCK_16X16].sdx3f          = vpx_sad16x16x3;
+    cpi->fn_ptr[BLOCK_16X16].sdx8f          = vpx_sad16x16x8;
+    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;
 
-    cpi->fn_ptr[BLOCK_16X8].sdf            = vp8_sad16x8;
-    cpi->fn_ptr[BLOCK_16X8].vf             = vp8_variance16x8;
-    cpi->fn_ptr[BLOCK_16X8].svf            = vp8_sub_pixel_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].sdf            = vpx_sad16x8;
+    cpi->fn_ptr[BLOCK_16X8].vf             = vpx_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].svf            = vpx_sub_pixel_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
-    cpi->fn_ptr[BLOCK_16X8].sdx3f          = vp8_sad16x8x3;
-    cpi->fn_ptr[BLOCK_16X8].sdx8f          = vp8_sad16x8x8;
-    cpi->fn_ptr[BLOCK_16X8].sdx4df         = vp8_sad16x8x4d;
+    cpi->fn_ptr[BLOCK_16X8].sdx3f          = vpx_sad16x8x3;
+    cpi->fn_ptr[BLOCK_16X8].sdx8f          = vpx_sad16x8x8;
+    cpi->fn_ptr[BLOCK_16X8].sdx4df         = vpx_sad16x8x4d;
 
-    cpi->fn_ptr[BLOCK_8X16].sdf            = vp8_sad8x16;
-    cpi->fn_ptr[BLOCK_8X16].vf             = vp8_variance8x16;
-    cpi->fn_ptr[BLOCK_8X16].svf            = vp8_sub_pixel_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].sdf            = vpx_sad8x16;
+    cpi->fn_ptr[BLOCK_8X16].vf             = vpx_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].svf            = vpx_sub_pixel_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
-    cpi->fn_ptr[BLOCK_8X16].sdx3f          = vp8_sad8x16x3;
-    cpi->fn_ptr[BLOCK_8X16].sdx8f          = vp8_sad8x16x8;
-    cpi->fn_ptr[BLOCK_8X16].sdx4df         = vp8_sad8x16x4d;
+    cpi->fn_ptr[BLOCK_8X16].sdx3f          = vpx_sad8x16x3;
+    cpi->fn_ptr[BLOCK_8X16].sdx8f          = vpx_sad8x16x8;
+    cpi->fn_ptr[BLOCK_8X16].sdx4df         = vpx_sad8x16x4d;
 
-    cpi->fn_ptr[BLOCK_8X8].sdf            = vp8_sad8x8;
-    cpi->fn_ptr[BLOCK_8X8].vf             = vp8_variance8x8;
-    cpi->fn_ptr[BLOCK_8X8].svf            = vp8_sub_pixel_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].sdf            = vpx_sad8x8;
+    cpi->fn_ptr[BLOCK_8X8].vf             = vpx_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].svf            = vpx_sub_pixel_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
-    cpi->fn_ptr[BLOCK_8X8].sdx3f          = vp8_sad8x8x3;
-    cpi->fn_ptr[BLOCK_8X8].sdx8f          = vp8_sad8x8x8;
-    cpi->fn_ptr[BLOCK_8X8].sdx4df         = vp8_sad8x8x4d;
+    cpi->fn_ptr[BLOCK_8X8].sdx3f          = vpx_sad8x8x3;
+    cpi->fn_ptr[BLOCK_8X8].sdx8f          = vpx_sad8x8x8;
+    cpi->fn_ptr[BLOCK_8X8].sdx4df         = vpx_sad8x8x4d;
 
-    cpi->fn_ptr[BLOCK_4X4].sdf            = vp8_sad4x4;
-    cpi->fn_ptr[BLOCK_4X4].vf             = vp8_variance4x4;
-    cpi->fn_ptr[BLOCK_4X4].svf            = vp8_sub_pixel_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].sdf            = vpx_sad4x4;
+    cpi->fn_ptr[BLOCK_4X4].vf             = vpx_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].svf            = vpx_sub_pixel_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
-    cpi->fn_ptr[BLOCK_4X4].sdx3f          = vp8_sad4x4x3;
-    cpi->fn_ptr[BLOCK_4X4].sdx8f          = vp8_sad4x4x8;
-    cpi->fn_ptr[BLOCK_4X4].sdx4df         = vp8_sad4x4x4d;
+    cpi->fn_ptr[BLOCK_4X4].sdx3f          = vpx_sad4x4x3;
+    cpi->fn_ptr[BLOCK_4X4].sdx8f          = vpx_sad4x4x8;
+    cpi->fn_ptr[BLOCK_4X4].sdx4df         = vpx_sad4x4x4d;
 
 #if ARCH_X86 || ARCH_X86_64
     cpi->fn_ptr[BLOCK_16X16].copymem      = vp8_copy32xn;
@@ -2204,9 +2255,6 @@
 
             if (cpi->b_calculate_psnr)
             {
-                YV12_BUFFER_CONFIG *lst_yv12 =
-                              &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-
                 if (cpi->oxcf.number_of_layers > 1)
                 {
                     int i;
@@ -2218,7 +2266,7 @@
                         double dr = (double)cpi->bytes_in_layer[i] *
                                               8.0 / 1000.0  / time_encoded;
                         double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
-                                         lst_yv12->y_width * lst_yv12->y_height;
+                                         cpi->common.Width * cpi->common.Height;
                         double total_psnr =
                             vpx_sse_to_psnr(samples, 255.0,
                                             cpi->total_error2[i]);
@@ -2240,7 +2288,7 @@
                 else
                 {
                     double samples = 3.0 / 2 * cpi->count *
-                                        lst_yv12->y_width * lst_yv12->y_height;
+                                     cpi->common.Width * cpi->common.Height;
                     double total_psnr = vpx_sse_to_psnr(samples, 255.0,
                                                         cpi->total_sq_error);
                     double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
@@ -2448,6 +2496,7 @@
     vpx_free(cpi->tok);
     vpx_free(cpi->cyclic_refresh_map);
     vpx_free(cpi->consec_zero_last);
+    vpx_free(cpi->consec_zero_last_mvbias);
 
     vp8_remove_common(&cpi->common);
     vpx_free(cpi);
@@ -2490,7 +2539,7 @@
         {
             unsigned int sse;
 
-            vp8_mse16x16(orig + col, orig_stride,
+            vpx_mse16x16(orig + col, orig_stride,
                                             recon + col, recon_stride,
                                             &sse);
             total_sse += sse;
@@ -2803,7 +2852,7 @@
     }
 
     /* Update data structure that monitors level of reference to last GF */
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
     /* this frame refreshes means next frames don't unless specified by user */
@@ -2852,7 +2901,7 @@
         }
 
         /* Update data structure that monitors level of reference to last GF */
-        vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
         cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
         /* this frame refreshes means next frames don't unless specified by
@@ -3150,10 +3199,8 @@
 
         cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
         cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
         cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
     }
     else    /* For non key frames */
     {
@@ -3165,9 +3212,7 @@
             cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
             cm->alt_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
             cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
         }
         else if (cm->copy_buffer_to_arf)
         {
@@ -3181,10 +3226,8 @@
                     yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->lst_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[ALTREF_FRAME] =
                         cpi->current_ref_frames[LAST_FRAME];
-#endif
                 }
             }
             else /* if (cm->copy_buffer_to_arf == 2) */
@@ -3195,10 +3238,8 @@
                     yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->gld_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[ALTREF_FRAME] =
                         cpi->current_ref_frames[GOLDEN_FRAME];
-#endif
                 }
             }
         }
@@ -3211,9 +3252,7 @@
             cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
             cm->gld_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
             cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
-#endif
         }
         else if (cm->copy_buffer_to_gf)
         {
@@ -3227,10 +3266,8 @@
                     yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->lst_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[GOLDEN_FRAME] =
                         cpi->current_ref_frames[LAST_FRAME];
-#endif
                 }
             }
             else /* if (cm->copy_buffer_to_gf == 2) */
@@ -3241,10 +3278,8 @@
                     yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->alt_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[GOLDEN_FRAME] =
                         cpi->current_ref_frames[ALTREF_FRAME];
-#endif
                 }
             }
         }
@@ -3256,9 +3291,7 @@
         cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
         cm->lst_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
         cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
-#endif
     }
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -3299,16 +3332,189 @@
                         &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
             }
         }
+        if (cpi->oxcf.noise_sensitivity == 4)
+          vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source);
 
     }
 #endif
 
 }
 
+static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
+                                       YV12_BUFFER_CONFIG *dest,
+                                       VP8_COMP *cpi)
+    {
+        int i, j;
+        int Total = 0;
+        int num_blocks = 0;
+        int skip = 2;
+        int min_consec_zero_last = 10;
+        int tot_num_blocks = (source->y_height * source->y_width) >> 8;
+        unsigned char *src = source->y_buffer;
+        unsigned char *dst = dest->y_buffer;
+
+        /* Loop through the Y plane, every |skip| blocks along rows and columns,
+         * summing the square differences, and only for blocks that have been
+         * in zero_last mode for at least |x| frames in a row.
+         */
+        for (i = 0; i < source->y_height; i += 16 * skip)
+        {
+            int block_index_row = (i >> 4) * cpi->common.mb_cols;
+            for (j = 0; j < source->y_width; j += 16 * skip)
+            {
+                int index = block_index_row + (j >> 4);
+                if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+                  unsigned int sse;
+                  Total += vpx_mse16x16(src + j,
+                                        source->y_stride,
+                                        dst + j, dest->y_stride,
+                                        &sse);
+                  num_blocks++;
+                }
+            }
+            src += 16 * skip * source->y_stride;
+            dst += 16 * skip * dest->y_stride;
+        }
+        // Only return non-zero if we have at least ~1/16 of the blocks for the
+        // estimate.
+        if (num_blocks > (tot_num_blocks >> 4)) {
+            return (Total / num_blocks);
+        } else {
+            return 0;
+        }
+    }
+
+#if CONFIG_TEMPORAL_DENOISING
+static void process_denoiser_mode_change(VP8_COMP *cpi) {
+  const VP8_COMMON *const cm = &cpi->common;
+  int i, j;
+  int total = 0;
+  int num_blocks = 0;
+  // Number of blocks skipped along row/column in computing the
+  // nmse (normalized mean square error) of source.
+  int skip = 2;
+  // Only select blocks for computing nmse that have been encoded
+  // as ZERO LAST min_consec_zero_last frames in a row.
+  // Scale with number of temporal layers.
+  int min_consec_zero_last = 12 / cpi->oxcf.number_of_layers;
+  // The decision to change the denoising mode is tested every
+  // num_mode_change times this function is called. Note that this
+  // function is called every ~8 frames, so (8 * num_mode_change) is the
+  // number of frames over which a switch of the denoising mode is considered.
+  int num_mode_change = 20;
+  // Framerate factor, to compensate for larger mse at lower framerates.
+  // Use ref_framerate, which is full source framerate for temporal layers.
+  // TODO(marpan): Adjust this factor.
+  int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
+  int tot_num_blocks = cm->mb_rows * cm->mb_cols;
+  int ystride = cpi->Source->y_stride;
+  unsigned char *src = cpi->Source->y_buffer;
+  unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer;
+  static const unsigned char const_source[16] = {
+      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128, 128, 128};
+  int bandwidth = (int)(cpi->target_bandwidth);
+  // For temporal layers, use full bandwidth (top layer).
+  if (cpi->oxcf.number_of_layers > 1) {
+    LAYER_CONTEXT *lc = &cpi->layer_context[cpi->oxcf.number_of_layers - 1];
+    bandwidth = (int)(lc->target_bandwidth);
+  }
+  // Loop through the Y plane, every skip blocks along rows and columns,
+  // summing the normalized mean square error, only for blocks that have
+  // been encoded as ZEROMV LAST for at least min_consec_zero_last frames in
+  // a row and have small sum difference between current and previous frame.
+  // Normalization here is by the contrast of the current frame block.
+  for (i = 0; i < cm->Height; i += 16 * skip) {
+    int block_index_row = (i >> 4) * cm->mb_cols;
+    for (j = 0; j < cm->Width; j += 16 * skip) {
+      int index = block_index_row + (j >> 4);
+      if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+        unsigned int sse;
+        const unsigned int var = vpx_variance16x16(src + j,
+                                                   ystride,
+                                                   dst + j,
+                                                   ystride,
+                                                   &sse);
+        // Only consider this block as valid for noise measurement
+        // if the sum_diff average of the current and previous frame
+        // is small (to avoid effects from lighting change).
+        if ((sse - var) < 128) {
+          unsigned int sse2;
+          const unsigned int act = vpx_variance16x16(src + j,
+                                                     ystride,
+                                                     const_source,
+                                                     0,
+                                                     &sse2);
+          if (act > 0)
+            total += sse / act;
+          num_blocks++;
+        }
+      }
+    }
+    src += 16 * skip * ystride;
+    dst += 16 * skip * ystride;
+  }
+  total = total * fac_framerate / 100;
+
+  // Only consider this frame as valid sample if we have computed nmse over
+  // at least ~1/16 blocks, and Total > 0 (Total == 0 can happen if the
+  // application inputs duplicate frames, or contrast is all zero).
+  if (total > 0 &&
+      (num_blocks > (tot_num_blocks >> 4))) {
+    // Update the recursive mean square source_diff.
+    total = (total << 8) / num_blocks;
+    if (cpi->denoiser.nmse_source_diff_count == 0) {
+      // First sample in new interval.
+      cpi->denoiser.nmse_source_diff = total;
+      cpi->denoiser.qp_avg = cm->base_qindex;
+    } else {
+      // For subsequent samples, use average with weight ~1/4 for new sample.
+      cpi->denoiser.nmse_source_diff = (int)((total +
+          3 * cpi->denoiser.nmse_source_diff) >> 2);
+      cpi->denoiser.qp_avg = (int)((cm->base_qindex +
+          3 * cpi->denoiser.qp_avg) >> 2);
+    }
+    cpi->denoiser.nmse_source_diff_count++;
+  }
+  // Check for changing the denoiser mode, when we have obtained #samples =
+  // num_mode_change. Condition the change also on the bitrate and QP.
+  if (cpi->denoiser.nmse_source_diff_count == num_mode_change) {
+    // Check for going up: from normal to aggressive mode.
+    if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) &&
+        (cpi->denoiser.nmse_source_diff >
+        cpi->denoiser.threshold_aggressive_mode) &&
+        (cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
+         bandwidth > cpi->denoiser.bitrate_threshold)) {
+      vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
+    } else {
+      // Check for going down: from aggressive to normal mode.
+      if (((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.nmse_source_diff <
+          cpi->denoiser.threshold_aggressive_mode)) ||
+          ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
+           bandwidth < cpi->denoiser.bitrate_threshold))) {
+        vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+      }
+    }
+    // Reset metric and counter for next interval.
+    cpi->denoiser.nmse_source_diff = 0;
+    cpi->denoiser.qp_avg = 0;
+    cpi->denoiser.nmse_source_diff_count = 0;
+  }
+}
+#endif
+
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
 {
     const FRAME_TYPE frame_type = cm->frame_type;
 
+    int update_any_ref_buffers = 1;
+    if (cpi->common.refresh_last_frame == 0 &&
+        cpi->common.refresh_golden_frame == 0 &&
+        cpi->common.refresh_alt_ref_frame == 0) {
+        update_any_ref_buffers = 0;
+    }
+
     if (cm->no_lpf)
     {
         cm->filter_level = 0;
@@ -3320,11 +3526,36 @@
         vp8_clear_system_state();
 
         vpx_usec_timer_start(&timer);
-        if (cpi->sf.auto_filter == 0)
+        if (cpi->sf.auto_filter == 0) {
+#if CONFIG_TEMPORAL_DENOISING
+            if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+                // Use the denoised buffer for selecting base loop filter level.
+                // Denoised signal for current frame is stored in INTRA_FRAME.
+                // No denoising on key frames.
+                vp8cx_pick_filter_level_fast(
+                    &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+            } else {
+                vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+            }
+#else
             vp8cx_pick_filter_level_fast(cpi->Source, cpi);
-
-        else
+#endif
+        } else {
+#if CONFIG_TEMPORAL_DENOISING
+            if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+                // Use the denoised buffer for selecting base loop filter level.
+                // Denoised signal for current frame is stored in INTRA_FRAME.
+                // No denoising on key frames.
+                vp8cx_pick_filter_level(
+                    &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+            } else {
+                vp8cx_pick_filter_level(cpi->Source, cpi);
+            }
+#else
             vp8cx_pick_filter_level(cpi->Source, cpi);
+#endif
+        }
+
 
         if (cm->filter_level > 0)
         {
@@ -3340,7 +3571,9 @@
         sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
 #endif
 
-    if (cm->filter_level > 0)
+    // No need to apply loop-filter if the encoded frame does not update
+    // any reference buffers.
+    if (cm->filter_level > 0 && update_any_ref_buffers)
     {
         vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
     }
@@ -3461,39 +3694,109 @@
     {
         /* Key frame from VFW/auto-keyframe/first frame */
         cm->frame_type = KEY_FRAME;
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity == 4) {
+          // For adaptive mode, reset denoiser to normal mode on key frame.
+          vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+        }
+#endif
     }
 
 #if CONFIG_MULTI_RES_ENCODING
-    /* In multi-resolution encoding, frame_type is decided by lowest-resolution
-     * encoder. Same frame_type is adopted while encoding at other resolution.
-     */
-    if (cpi->oxcf.mr_encoder_id)
-    {
-        LOWER_RES_FRAME_INFO* low_res_frame_info
-                        = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+    if (cpi->oxcf.mr_total_resolutions > 1) {
+      LOWER_RES_FRAME_INFO* low_res_frame_info
+         = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
 
+      if (cpi->oxcf.mr_encoder_id) {
+
+        // TODO(marpan): This constraint shouldn't be needed, as we would like
+        // to allow for key frame setting (forced or periodic) defined per
+        // spatial layer. For now, keep this in.
         cm->frame_type = low_res_frame_info->frame_type;
 
+        // Check if lower resolution is available for motion vector reuse.
         if(cm->frame_type != KEY_FRAME)
         {
-            cpi->mr_low_res_mv_avail = 1;
-            cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+          cpi->mr_low_res_mv_avail = 1;
+          cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
 
-            if (cpi->ref_frame_flags & VP8_LAST_FRAME)
-                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
-                         == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+          if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
 
-            if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
-                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
-                         == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+          if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
 
-            if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
-                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
-                         == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+          // Don't use altref to determine whether low res is available.
+          // TODO (marpan): Should we make this type of condition on a
+          // per-reference frame basis?
+          /*
+          if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+          */
         }
+      }
+
+      // On a key frame: For the lowest resolution, keep track of the key frame
+      // counter value. For the higher resolutions, reset the current video
+      // frame counter to that of the lowest resolution.
+      // This is done to handle the case where we may stop/start encoding
+      // higher layer(s). For now, the restart of encoding a higher layer is
+      // only signaled by a key frame.
+      // TODO (marpan): Add flag to indicate restart-encoding of higher layer.
+      if (cm->frame_type == KEY_FRAME) {
+        if (cpi->oxcf.mr_encoder_id) {
+          // If the initial starting value of the buffer level is zero (this can
+          // happen because we may have not started encoding this higher stream),
+          // then reset it to non-zero value based on |starting_buffer_level|.
+          if (cpi->common.current_video_frame == 0 && cpi->buffer_level == 0) {
+            unsigned int i;
+            cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+            cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+            for (i = 0; i < cpi->oxcf.number_of_layers; i++) {
+              LAYER_CONTEXT *lc = &cpi->layer_context[i];
+              lc->bits_off_target = lc->starting_buffer_level;
+              lc->buffer_level = lc->starting_buffer_level;
+            }
+          }
+          cpi->common.current_video_frame =
+              low_res_frame_info->key_frame_counter_value;
+        } else {
+          low_res_frame_info->key_frame_counter_value =
+              cpi->common.current_video_frame;
+        }
+      }
+
     }
 #endif
 
+    // Find the reference frame closest to the current frame.
+    cpi->closest_reference_frame = LAST_FRAME;
+    if(cm->frame_type != KEY_FRAME) {
+      int i;
+      MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
+      if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+        closest_ref = LAST_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) {
+        closest_ref = GOLDEN_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
+        closest_ref = ALTREF_FRAME;
+      }
+      for(i = 1; i <= 3; i++) {
+        vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
+            ((i == 3) ? 4 : i);
+        if (cpi->ref_frame_flags & ref_frame_type) {
+          if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
+              (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+            closest_ref = i;
+          }
+        }
+      }
+      cpi->closest_reference_frame = closest_ref;
+    }
+
     /* Set various flags etc to special state if it is a key frame */
     if (cm->frame_type == KEY_FRAME)
     {
@@ -3512,7 +3815,9 @@
         }
 
         // Reset the zero_last counter to 0 on key frame.
-        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+        memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+        memset(cpi->consec_zero_last_mvbias, 0,
+               (cpi->common.mb_rows * cpi->common.mb_cols));
     }
 
 #if 0
@@ -3863,7 +4168,10 @@
     */
     if (cpi->cyclic_refresh_mode_enabled)
     {
-      if (cpi->current_layer==0)
+      // Special case for screen_content_mode with golden frame updates.
+      int disable_cr_gf = (cpi->oxcf.screen_content_mode == 2 &&
+                           cm->refresh_golden_frame);
+      if (cpi->current_layer == 0 && cpi->force_maxqp == 0 && !disable_cr_gf)
         cyclic_background_refresh(cpi, Q, 0);
       else
         disable_segmentation(cpi);
@@ -3885,6 +4193,17 @@
 
     scale_and_extend_source(cpi->un_scaled_source, cpi);
 
+#if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC
+    // Option to apply spatial blur under the aggressive or adaptive
+    // (temporal denoising) mode.
+    if (cpi->oxcf.noise_sensitivity >= 3) {
+      if (cpi->denoiser.denoise_pars.spatial_blur != 0) {
+        vp8_de_noise(cm, cpi->Source, cpi->Source,
+            cpi->denoiser.denoise_pars.spatial_blur, 1, 0, 0);
+      }
+    }
+#endif
+
 #if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
 
     if (cpi->oxcf.noise_sensitivity > 0)
@@ -3917,11 +4236,11 @@
 
         if (cm->frame_type == KEY_FRAME)
         {
-            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0);
+            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0, 1);
         }
         else
         {
-            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0);
+            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0, 1);
 
             src = cpi->Source->y_buffer;
 
@@ -4030,8 +4349,10 @@
                 else
                   disable_segmentation(cpi);
               }
-              // Reset the consec_zero_last counter on key frame.
-              vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+              // Reset the zero_last counter to 0 on key frame.
+              memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+              memset(cpi->consec_zero_last_mvbias, 0,
+                     (cpi->common.mb_rows * cpi->common.mb_cols));
               vp8_set_quantizer(cpi, Q);
             }
 
@@ -4054,7 +4375,7 @@
             if (cm->refresh_entropy_probs == 0)
             {
                 /* save a copy for later refresh */
-                vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+                memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
             }
 
             vp8_update_coef_context(cpi);
@@ -4072,6 +4393,11 @@
         /* transform / motion compensation build reconstruction frame */
         vp8_encode_frame(cpi);
 
+        if (cpi->oxcf.screen_content_mode == 2) {
+          if (vp8_drop_encodedframe_overshoot(cpi, Q))
+            return;
+        }
+
         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
 #endif
@@ -4420,7 +4746,8 @@
             {
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
                 {
-                    if(tmp->mbmi.mode == ZEROMV)
+                    if (tmp->mbmi.mode == ZEROMV &&
+                       tmp->mbmi.ref_frame == LAST_FRAME)
                         cpi->zeromv_count++;
                     tmp++;
                 }
@@ -4462,6 +4789,38 @@
 
     cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
+#if CONFIG_TEMPORAL_DENOISING
+    // Get some measure of the amount of noise, by measuring the (partial) mse
+    // between source and denoised buffer, for y channel. Partial refers to
+    // computing the sse for a sub-sample of the frame (i.e., skip x blocks
+    // along row/column), and only for blocks in that set that have used
+    // ZEROMV_LAST consecutively.
+    // Do this every ~8 frames, to further reduce complexity.
+    // TODO(marpan): Keep this for now for the case
+    // cpi->oxcf.noise_sensitivity < 4; it should be removed in favor of the
+    // process_denoiser_mode_change() function below.
+    if (cpi->oxcf.noise_sensitivity > 0 &&
+       cpi->oxcf.noise_sensitivity < 4 &&
+       !cpi->oxcf.screen_content_mode &&
+       cpi->frames_since_key%8 == 0 &&
+       cm->frame_type != KEY_FRAME) {
+       cpi->mse_source_denoised = measure_square_diff_partial(
+           &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi->Source, cpi);
+    }
+
+    // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
+    // of source diff (between current and previous frame), and determine if we
+    // should switch the denoiser mode. Sampling refers to computing the mse for
+    // a sub-sample of the frame (i.e., skip x blocks along row/column), and
+    // only for blocks in that set that have used ZEROMV_LAST, along with some
+    // constraint on the sum diff between blocks. This process is called every
+    // ~8 frames, to further reduce complexity.
+    if (cpi->oxcf.noise_sensitivity == 4 &&
+        !cpi->oxcf.screen_content_mode &&
+        cpi->frames_since_key % 8 == 0 &&
+        cm->frame_type != KEY_FRAME) {
+      process_denoiser_mode_change(cpi);
+    }
+#endif
+
 #if CONFIG_MULTITHREAD
     if (cpi->b_multi_threaded)
     {
@@ -4593,6 +4952,13 @@
     if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
         cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
 
+    // If the frame dropper is not enabled, don't let the buffer level go below
+    // some threshold, given here by -|maximum_buffer_size|. For now we only do
+    // this for screen content input.
+    if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+        cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size)
+        cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
+
     /* Rolling monitors of whether we are over or underspending used to
      * help regulate min and Max Q in two pass.
      */
@@ -5067,7 +5433,26 @@
 
                 cpi->ref_framerate = 10000000.0 / avg_duration;
             }
-
+#if CONFIG_MULTI_RES_ENCODING
+            if (cpi->oxcf.mr_total_resolutions > 1) {
+              LOWER_RES_FRAME_INFO* low_res_frame_info = (LOWER_RES_FRAME_INFO*)
+                  cpi->oxcf.mr_low_res_mode_info;
+              // Frame rate should be the same for all spatial layers in
+              // multi-res-encoding (simulcast), so we constrain the frame rate of
+              // higher layers to be that of the lowest resolution. This is needed
+              // as the application may decide to skip encoding a high layer and
+              // then start again, in which case a big jump in time-stamps will
+              // be received for that high layer, which will yield an incorrect
+              // frame rate (from time-stamp adjustment in above calculation).
+              if (cpi->oxcf.mr_encoder_id) {
+                cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+              }
+              else {
+                // Keep track of frame rate for lowest resolution.
+                low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+              }
+            }
+#endif
             if (cpi->oxcf.number_of_layers > 1)
             {
                 unsigned int i;
@@ -5097,8 +5482,12 @@
         update_layer_contexts (cpi);
 
         /* Restore layer specific context & set frame rate */
-        layer = cpi->oxcf.layer_id[
-                cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+        if (cpi->temporal_layer_id >= 0) {
+          layer = cpi->temporal_layer_id;
+        } else {
+          layer = cpi->oxcf.layer_id[
+                  cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+        }
         restore_layer_context (cpi, layer);
         vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
     }
@@ -5217,19 +5606,19 @@
 
     if (cm->refresh_entropy_probs == 0)
     {
-        vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+        memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
     }
 
     /* Save the contexts separately for alt ref, gold and last. */
     /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
     if(cm->refresh_alt_ref_frame)
-        vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+        memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
 
     if(cm->refresh_golden_frame)
-        vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+        memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
 
     if(cm->refresh_last_frame)
-        vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+        memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
 
     /* if its a dropped frame honor the requests on subsequent frames */
     if (*size > 0)
@@ -5274,19 +5663,23 @@
                 double frame_psnr;
                 YV12_BUFFER_CONFIG      *orig = cpi->Source;
                 YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-                int y_samples = orig->y_height * orig->y_width ;
-                int uv_samples = orig->uv_height * orig->uv_width ;
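+                // Use the frame dimensions from cpi->common; for 4:2:0 input
+                // the chroma planes are half the luma size, rounded up.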
+                unsigned int y_width = cpi->common.Width;
+                unsigned int y_height = cpi->common.Height;
+                unsigned int uv_width = (y_width + 1) / 2;
+                unsigned int uv_height = (y_height + 1) / 2;
+                int y_samples = y_height * y_width;
+                int uv_samples = uv_height * uv_width;
                 int t_samples = y_samples + 2 * uv_samples;
                 double sq_error;
 
                 ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                  recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
+                  recon->y_buffer, recon->y_stride, y_width, y_height);
 
                 ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                  recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+                  recon->u_buffer, recon->uv_stride, uv_width, uv_height);
 
                 ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                  recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+                  recon->v_buffer, recon->uv_stride, uv_width, uv_height);
 
                 sq_error = (double)(ye + ue + ve);
 
@@ -5308,13 +5701,13 @@
                     vp8_clear_system_state();
 
                     ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                      pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height);
+                      pp->y_buffer, pp->y_stride, y_width, y_height);
 
                     ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                      pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+                      pp->u_buffer, pp->uv_stride, uv_width, uv_height);
 
                     ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                      pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+                      pp->v_buffer, pp->uv_stride, uv_width, uv_height);
 
                     sq_error2 = (double)(ye + ue + ve);
 
@@ -5329,8 +5722,8 @@
                     cpi->total_sq_error2 += sq_error2;
                     cpi->totalp  += frame_psnr2;
 
-                    frame_ssim2 = vp8_calc_ssim(cpi->Source,
-                      &cm->post_proc_buffer, 1, &weight);
+                    frame_ssim2 = vpx_calc_ssim(cpi->Source,
+                      &cm->post_proc_buffer, &weight);
 
                     cpi->summed_quality += frame_ssim2 * weight;
                     cpi->summed_weights += weight;
@@ -5360,7 +5753,7 @@
             if (cpi->b_calculate_ssimg)
             {
                 double y, u, v, frame_all;
-                frame_all =  vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
+                frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show,
                     &y, &u, &v);
 
                 if (cpi->oxcf.number_of_layers > 1)
@@ -5441,6 +5834,7 @@
         cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
 #else
+        (void)flags;
 
         if (cpi->common.frame_to_show)
         {
@@ -5533,7 +5927,7 @@
     {
         if (map)
         {
-            vpx_memcpy(cpi->active_map, map, rows * cols);
+            memcpy(cpi->active_map, map, rows * cols);
             cpi->active_map_enabled = 1;
         }
         else
@@ -5580,7 +5974,8 @@
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+            Total += vpx_mse16x16(src + j, source->y_stride,
+                                  dst + j, dest->y_stride, &sse);
         }
 
         src += 16 * source->y_stride;
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index 7a8baca..8beba27 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -18,9 +18,9 @@
 #include "treewriter.h"
 #include "tokenize.h"
 #include "vp8/common/onyxc_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "encodemb.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "vp8/common/entropy.h"
 #include "vp8/common/threading.h"
 #include "vpx_ports/mem.h"
@@ -513,10 +513,20 @@
     signed char *cyclic_refresh_map;
     // Counts how many (consecutive) times a macroblock uses ZEROMV_LAST.
     unsigned char *consec_zero_last;
+    // Counter that is reset when a block is checked for a mode-bias against
+    // ZEROMV_LASTREF.
+    unsigned char *consec_zero_last_mvbias;
 
     // Frame counter for the temporal pattern. Counter is reset when the temporal
     // layers are changed dynamically (run-time change).
     unsigned int temporal_pattern_counter;
+    // Temporal layer id.
+    int temporal_layer_id;
+
+    // Measure of average squared difference between source and denoised signal.
+    int mse_source_denoised;
+
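+    // When set, force the maximum quantizer for the frame; cyclic background
+    // refresh is also skipped in that case.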
+    int force_maxqp;
 
 #if CONFIG_MULTITHREAD
     /* multithread data */
@@ -657,6 +667,9 @@
 
     int droppable;
 
+    int initial_width;
+    int initial_height;
+
 #if CONFIG_TEMPORAL_DENOISING
     VP8_DENOISER denoiser;
 #endif
@@ -684,9 +697,11 @@
     int    mr_low_res_mb_cols;
     /* Indicate if lower-res mv info is available */
     unsigned char  mr_low_res_mv_avail;
+#endif
     /* The frame number of each reference frames */
     unsigned int current_ref_frames[MAX_REF_FRAMES];
-#endif
+    // Closest reference frame to current frame.
+    MV_REFERENCE_FRAME closest_reference_frame;
 
     struct rd_costs_struct
     {
@@ -701,6 +716,11 @@
     } rd_costs;
 } VP8_COMP;
 
+void vp8_alloc_compressor_data(VP8_COMP *cpi);
+int vp8_reverse_trans(int x);
+void vp8_new_framerate(VP8_COMP *cpi, double framerate);
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
 void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
                         unsigned char *dest_end, unsigned long *size);
 
diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c
index d0ad721..5ce98ad 100644
--- a/libvpx/vp8/encoder/pickinter.c
+++ b/libvpx/vp8/encoder/pickinter.c
@@ -11,6 +11,7 @@
 
 #include <limits.h>
 #include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
@@ -21,7 +22,7 @@
 #include "encodemb.h"
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra4x4.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
@@ -29,8 +30,6 @@
 #include "denoising.h"
 #endif
 
-extern int VP8_UVSSE(MACROBLOCK *x);
-
 #ifdef SPEEDSTATS
 extern unsigned int cnt_pm;
 #endif
@@ -38,7 +37,133 @@
 extern const int vp8_ref_frame_order[MAX_MODES];
 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
-extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+// Fixed point implementation of a skin color classifier. Skin color
+// is modeled by a Gaussian distribution in the CbCr color space.
+// See ../../test/skin_color_detector_test.cc where the reference
+// skin color classifier is defined.
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold = 1570636;                    // q18
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
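+// The returned value is, in fixed point, the quadratic form
+//   d = [dcb dcr] * inv_cov * [dcb dcr]^T,
+// where dcb = cb_q6 - skin_mean[0] and dcr = cr_q6 - skin_mean[1]. The squared
+// differences are q12; rounding and shifting by 10 brings them to q2, so the
+// q16 covariance weights give a q18 result comparable against skin_threshold.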
+static int evaluate_skin_color_difference(int cb, int cr)
+{
+  const int cb_q6 = cb << 6;
+  const int cr_q6 = cr << 6;
+  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+  const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+      skin_inv_cov[1] * cbcr_diff_q2 +
+      skin_inv_cov[2] * cbcr_diff_q2 +
+      skin_inv_cov[3] * cr_diff_q2;
+  return skin_diff;
+}
+
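+// Returns the largest absolute difference between the corner sample at
+// (offsetx, offsety) and its three neighbors in the (sgnx, sgny) direction.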
+static int macroblock_corner_grad(unsigned char* signal, int stride,
+                                  int offsetx, int offsety, int sgnx, int sgny)
+{
+  int y1 = signal[offsetx * stride + offsety];
+  int y2 = signal[offsetx * stride + offsety + sgny];
+  int y3 = signal[(offsetx + sgnx) * stride + offsety];
+  int y4 = signal[(offsetx + sgnx) * stride + offsety + sgny];
+  return MAX(MAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4));
+}
+
+static int check_dot_artifact_candidate(VP8_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        unsigned char *target_last,
+                                        int stride,
+                                        unsigned char* last_ref,
+                                        int mb_row,
+                                        int mb_col,
+                                        int channel)
+{
+  int threshold1 = 6;
+  int threshold2 = 3;
+  unsigned int max_num = (cpi->common.MBs) / 10;
+  int grad_last = 0;
+  int grad_source = 0;
+  int index = mb_row * cpi->common.mb_cols + mb_col;
+  // Threshold for #consecutive (base layer) frames using zero_last mode.
+  int num_frames = 30;
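+  // Far-corner sample offset: 15 for the 16x16 luma block, 7 for the 8x8
+  // chroma blocks.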
+  int shift = 15;
+  if (channel > 0) {
+    shift = 7;
+  }
+  if (cpi->oxcf.number_of_layers > 1)
+  {
+    num_frames = 20;
+  }
+  x->zero_last_dot_suppress = 0;
+  // Blocks on base layer frames that have been using ZEROMV_LAST repeatedly
+  // (i.e., at least |x| consecutive frames) are candidates for increasing the
+  // rd adjustment for zero_last mode.
+  // Only allow this for at most |max_num| blocks per frame.
+  // Don't allow this for screen content input.
+  if (cpi->current_layer == 0 &&
+      cpi->consec_zero_last_mvbias[index] > num_frames &&
+      x->mbs_zero_last_dot_suppress < max_num &&
+      !cpi->oxcf.screen_content_mode)
+  {
+    // If this block is checked here, label it so we don't check it again until
+    // ~|x| frames later.
+    x->zero_last_dot_suppress = 1;
+    // A dot artifact is noticeable as a strong gradient at the corners of a
+    // macroblock in flat areas. As a simple detector for now, we look for a high
+    // corner gradient on last ref, and a smaller gradient on source.
+    // Check 4 corners, return if any satisfy condition.
+    // Top-left:
+    grad_last = macroblock_corner_grad(last_ref, stride, 0, 0, 1, 1);
+    grad_source = macroblock_corner_grad(target_last, stride, 0, 0, 1, 1);
+    if (grad_last >= threshold1 && grad_source <= threshold2)
+    {
+       x->mbs_zero_last_dot_suppress++;
+       return 1;
+    }
+    // Top-right:
+    grad_last = macroblock_corner_grad(last_ref, stride, 0, shift, 1, -1);
+    grad_source = macroblock_corner_grad(target_last, stride, 0, shift, 1, -1);
+    if (grad_last >= threshold1 && grad_source <= threshold2)
+    {
+      x->mbs_zero_last_dot_suppress++;
+      return 1;
+    }
+    // Bottom-left:
+    grad_last = macroblock_corner_grad(last_ref, stride, shift, 0, -1, 1);
+    grad_source = macroblock_corner_grad(target_last, stride, shift, 0, -1, 1);
+    if (grad_last >= threshold1 && grad_source <= threshold2)
+    {
+      x->mbs_zero_last_dot_suppress++;
+      return 1;
+    }
+    // Bottom-right:
+    grad_last = macroblock_corner_grad(last_ref, stride, shift, shift, -1, -1);
+    grad_source = macroblock_corner_grad(target_last, stride, shift, shift, -1, -1);
+    if (grad_last >= threshold1 && grad_source <= threshold2)
+    {
+      x->mbs_zero_last_dot_suppress++;
+      return 1;
+    }
+    return 0;
+  }
+  return 0;
+}
+
+// Checks if the input yCbCr values correspond to skin color.
+static int is_skin_color(int y, int cb, int cr)
+{
+  if (y < 40 || y > 220)
+  {
+    return 0;
+  }
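+  // Within the valid luma range, classify by thresholding the fixed-point
+  // CbCr Mahalanobis distance.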
+  return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
 
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
@@ -52,6 +177,7 @@
     (void) ref_mv;
     (void) error_per_bit;
     (void) vfp;
+    (void) mb;
     (void) mvcost;
     (void) distortion;
     (void) sse;
@@ -90,33 +216,6 @@
 
 }
 
-
-unsigned int vp8_get4x4sse_cs_c
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride
-)
-{
-    int distortion = 0;
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            int diff = src_ptr[c] - ref_ptr[c];
-            distortion += diff * diff;
-        }
-
-        src_ptr += source_stride;
-        ref_ptr += recon_stride;
-    }
-
-    return distortion;
-}
-
 static int get_prediction_error(BLOCK *be, BLOCKD *b)
 {
     unsigned char *sptr;
@@ -124,7 +223,7 @@
     sptr = (*(be->base_src) + be->src);
     dptr = b->predictor;
 
-    return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+    return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
 
 }
 
@@ -487,6 +586,7 @@
     MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
     int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
     int this_rd;
+    int denoise_aggressive = 0;
     /* Exit early and don't compute the distortion if this macroblock
      * is marked inactive. */
     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -505,16 +605,24 @@
 
     this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
 
-    /* Adjust rd to bias to ZEROMV */
-    if(this_mode == ZEROMV)
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      denoise_aggressive =
+        (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+    }
+#endif
+
+    // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
+    // TODO: We should also add a condition on the distance of the closest
+    // reference frame to the current frame.
+    if(!cpi->oxcf.screen_content_mode &&
+       this_mode == ZEROMV &&
+       x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+       (denoise_aggressive || (cpi->closest_reference_frame == LAST_FRAME)))
     {
-        /* Bias to ZEROMV on LAST_FRAME reference when it is available. */
-        if ((cpi->ref_frame_flags & VP8_LAST_FRAME &
-            cpi->common.refresh_last_frame)
-            && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME)
+        // No adjustment if the block is considered to be a skin area.
+        if(x->is_skin)
             rd_adj = 100;
 
-        // rd_adj <= 100
         this_rd = ((int64_t)this_rd) * rd_adj / 100;
     }
 
@@ -595,6 +703,15 @@
 #endif
 
     int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
+
+#if CONFIG_MULTI_RES_ENCODING
+    int dissim = INT_MAX;
+    int parent_ref_frame = 0;
+    int_mv parent_ref_mv;
+    MB_PREDICTION_MODE parent_mode = 0;
+    int parent_ref_valid = 0;
+#endif
+
     int_mv mvp;
 
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -605,14 +722,56 @@
     unsigned char *plane[4][3];
     int ref_frame_map[4];
     int sign_bias = 0;
+    int dot_artifact_candidate = 0;
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+    // If the current frame is using LAST as a reference, check for
+    // biasing the mode selection for dot artifacts.
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+      unsigned char* target_y = x->src.y_buffer;
+      unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+      unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+      int stride = x->src.y_stride;
+      int stride_uv = x->block[16].src_stride;
+#if CONFIG_TEMPORAL_DENOISING
+      if (cpi->oxcf.noise_sensitivity) {
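+        // With the denoiser enabled, run the dot-artifact check against the
+        // denoiser's running average for LAST instead of the raw source.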
+        const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+        target_y =
+            cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+        stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+        if (uv_denoise) {
+          target_u =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+                  recon_uvoffset;
+          target_v =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+                  recon_uvoffset;
+          stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+        }
+      }
+#endif
+      dot_artifact_candidate =
+          check_dot_artifact_candidate(cpi, x, target_y, stride,
+              plane[LAST_FRAME][0], mb_row, mb_col, 0);
+      // If not found in Y channel, check UV channel.
+      if (!dot_artifact_candidate) {
+        dot_artifact_candidate =
+            check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+                plane[LAST_FRAME][1], mb_row, mb_col, 1);
+        if (!dot_artifact_candidate) {
+          dot_artifact_candidate =
+              check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+                  plane[LAST_FRAME][2], mb_row, mb_col, 2);
+        }
+      }
+    }
 
 #if CONFIG_MULTI_RES_ENCODING
-    int dissim = INT_MAX;
-    int parent_ref_frame = 0;
-    int parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
-    int_mv parent_ref_mv;
-    MB_PREDICTION_MODE parent_mode = 0;
-
+    // |parent_ref_valid| will be set here if we can potentially do mv reuse
+    // for this higher-resolution (|cpi->oxcf.mr_encoder_id| > 0) frame.
+    // |parent_ref_valid| may be reset depending on |parent_ref_frame| for
+    // the current macroblock below.
+    parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
     if (parent_ref_valid)
     {
         int parent_ref_flag;
@@ -630,24 +789,51 @@
          * In this event, take the conservative approach of disabling the
          * lower res info for this MB.
          */
+
         parent_ref_flag = 0;
+        // Note: availability for mv reuse is based only on last and golden.
         if (parent_ref_frame == LAST_FRAME)
             parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
         else if (parent_ref_frame == GOLDEN_FRAME)
             parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
-        else if (parent_ref_frame == ALTREF_FRAME)
-            parent_ref_flag = (cpi->ref_frame_flags & VP8_ALTR_FRAME);
 
         //assert(!parent_ref_frame || parent_ref_flag);
+
+        // If |parent_ref_frame| did not match either last or golden then
+        // shut off mv reuse.
         if (parent_ref_frame && !parent_ref_flag)
             parent_ref_valid = 0;
+
+        // Don't do mv reuse since we want to allow for another mode besides
+        // ZEROMV_LAST to remove dot artifact.
+        if (dot_artifact_candidate)
+          parent_ref_valid = 0;
+    }
+#endif
+
+    // Check if current macroblock is in skin area.
+    {
+      const int y = x->src.y_buffer[7 * x->src.y_stride + 7];
+      const int cb = x->src.u_buffer[3 * x->src.uv_stride + 3];
+      const int cr = x->src.v_buffer[3 * x->src.uv_stride + 3];
+      x->is_skin = 0;
+      if (!cpi->oxcf.screen_content_mode)
+        x->is_skin = is_skin_color(y, cb, cr);
+    }
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      // Under aggressive denoising mode, should we use the skin map to reduce
+      // the denoiser and ZEROMV bias? Will need to revisit the accuracy of
+      // this detection for very noisy input. For now keep this as is (i.e.,
+      // don't turn it off).
+      // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive)
+      //   x->is_skin = 0;
     }
 #endif
 
     mode_mv = mode_mv_sb[sign_bias];
     best_ref_mv.as_int = 0;
-    vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
-    vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+    memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+    memset(&best_mbmode, 0, sizeof(best_mbmode));
 
     /* Setup search priorities */
 #if CONFIG_MULTI_RES_ENCODING
@@ -678,8 +864,6 @@
         best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
     }
 
-    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-
     /* Count of the number of MBs tested so far this frame */
     x->mbs_tested_so_far++;
 
@@ -689,9 +873,13 @@
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
     /* If the frame has big static background and current MB is in low
-     * motion area, its mode decision is biased to ZEROMV mode.
-     */
-    calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+     * motion area, its mode decision is biased to ZEROMV mode.
+     * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+     * At such speed settings, ZEROMV is already heavily favored.
+     */
+    if (cpi->Speed < 12) {
+      calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+    }
 
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
@@ -700,6 +888,13 @@
     }
 #endif
 
+    if (dot_artifact_candidate)
+    {
+        // Bias against ZEROMV_LAST mode.
+        rd_adjustment = 150;
+    }
+
+
     /* if we encode a new mv this is important
      * find the best new motion vector
      */
@@ -816,7 +1011,7 @@
             else
             {
                 rate2 += rate;
-                distortion2 = vp8_variance16x16(
+                distortion2 = vpx_variance16x16(
                                     *(b->base_src), b->src_stride,
                                     x->e_mbd.predictor, 16, &sse);
                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -845,7 +1040,7 @@
                                              xd->dst.y_stride,
                                              xd->predictor,
                                              16);
-            distortion2 = vp8_variance16x16
+            distortion2 = vpx_variance16x16
                                           (*(b->base_src), b->src_stride,
                                           x->e_mbd.predictor, 16, &sse);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@@ -885,14 +1080,17 @@
             step_param = cpi->sf.first_step + speed_adjust;
 
 #if CONFIG_MULTI_RES_ENCODING
-            /* If lower-res drops this frame, then higher-res encoder does
-               motion search without any previous knowledge. Also, since
-               last frame motion info is not stored, then we can not
+            /* If lower-res frame is not available for mv reuse (because of
+               frame dropping or a different temporal layer pattern), then the
+               higher-resolution encoder does motion search without any previous
+               knowledge. Also, since last-frame motion info is not stored, we cannot
                use improved_mv_pred. */
-            if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
+            if (cpi->oxcf.mr_encoder_id)
                 sf_improved_mv_pred = 0;
 
-            if (parent_ref_valid && parent_ref_frame)
+            // Only use parent MV as predictor if this candidate reference frame
+            // (|this_ref_frame|) is equal to |parent_ref_frame|.
+            if (parent_ref_valid && (parent_ref_frame == this_ref_frame))
             {
                 /* Use parent MV as predictor. Adjust search range
                  * accordingly.
@@ -936,7 +1134,8 @@
             }
 
 #if CONFIG_MULTI_RES_ENCODING
-            if (parent_ref_valid && parent_ref_frame && dissim <= 2 &&
+            if (parent_ref_valid && (parent_ref_frame == this_ref_frame) &&
+                dissim <= 2 &&
                 MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
                     abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
             {
@@ -973,10 +1172,12 @@
                  * change the behavior in lowest-resolution encoder.
                  * Will improve it later.
                  */
-                 /* Set step_param to 0 to ensure large-range motion search
-                    when encoder drops this frame at lower-resolution.
-                  */
-                if (!parent_ref_valid)
+                /* Set step_param to 0 to ensure large-range motion search
+                 * when mv reuse is not valid (i.e. |parent_ref_valid| = 0),
+                 * or if this candidate reference frame (|this_ref_frame|) is
+                 * not equal to |parent_ref_frame|.
+                 */
+                if (!parent_ref_valid || (parent_ref_frame != this_ref_frame))
                     step_param = 0;
 #endif
                     bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
@@ -1039,7 +1240,10 @@
             }
 
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
-
+            // The clamp below is not necessary from the perspective of the
+            // VP8 bitstream, but is added to improve the robustness of
+            // Chromecast mirroring. Please do not remove.
+            vp8_clamp_mv2(&mode_mv[this_mode], xd);
             /* mv cost; */
             rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
                                      cpi->mb.mvcost, 128);
@@ -1047,7 +1251,6 @@
 
         case NEARESTMV:
         case NEARMV:
-
             if (mode_mv[this_mode].as_int == 0)
                 continue;
 
@@ -1078,18 +1281,24 @@
 #if CONFIG_TEMPORAL_DENOISING
         if (cpi->oxcf.noise_sensitivity)
         {
-
             /* Store for later use by denoiser. */
-            if (this_mode == ZEROMV && sse < zero_mv_sse )
+            // Don't denoise with GOLDEN or ALTREF if they are old reference
+            // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in the past).
+            int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
+                (cpi->common.current_video_frame -
+                 cpi->current_ref_frames[this_ref_frame] >
+                 MAX_GF_ARF_DENOISE_RANGE)) ? 1 : 0;
+            if (this_mode == ZEROMV && sse < zero_mv_sse &&
+                !skip_old_reference)
             {
                 zero_mv_sse = sse;
                 x->best_zeromv_reference_frame =
                         x->e_mbd.mode_info_context->mbmi.ref_frame;
             }
 
-            /* Store the best NEWMV in x for later use in the denoiser. */
+            // Store the best NEWMV in x for later use in the denoiser.
             if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
-                    sse < best_sse)
+                sse < best_sse && !skip_old_reference)
             {
                 best_sse = sse;
                 x->best_sse_inter_mode = NEWMV;
@@ -1111,8 +1320,8 @@
             *returndistortion = distortion2;
             best_rd_sse = sse;
             best_rd = this_rd;
-            vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
-                       sizeof(MB_MODE_INFO));
+            memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                   sizeof(MB_MODE_INFO));
 
             /* Testing this mode gave rise to an improvement in best error
              * score. Lower threshold a bit for next time
@@ -1175,6 +1384,8 @@
     if (cpi->oxcf.noise_sensitivity)
     {
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
+        int reevaluate = 0;
+        int is_noisy = 0;
         if (x->best_sse_inter_mode == DC_PRED)
         {
             /* No best MV found. */
@@ -1184,18 +1395,52 @@
             x->best_reference_frame = best_mbmode.ref_frame;
             best_sse = best_rd_sse;
         }
+        // For non-skin blocks that have selected ZEROMV for the current frame,
+        // and have been selecting ZEROMV_LAST (on the base layer frame) for at
+        // least |x~20| consecutive past frames, label the block for a possible
+        // increase in denoising strength. We also condition this labeling on
+        // there being significant denoising in the scene.
+        if (cpi->oxcf.noise_sensitivity == 4) {
+          if (cpi->denoiser.nmse_source_diff >
+              70 * cpi->denoiser.threshold_aggressive_mode / 100)
+            is_noisy = 1;
+        } else {
+          if (cpi->mse_source_denoised > 1000)
+            is_noisy = 1;
+        }
         x->increase_denoising = 0;
+        if (!x->is_skin &&
+            x->best_sse_inter_mode == ZEROMV &&
+            (x->best_reference_frame == LAST_FRAME ||
+            x->best_reference_frame == cpi->closest_reference_frame) &&
+            cpi->consec_zero_last[block_index] >= 20 &&
+            is_noisy) {
+            x->increase_denoising = 1;
+        }
+        x->denoise_zeromv = 0;
         vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                 recon_yoffset, recon_uvoffset,
                                 &cpi->common.lf_info, mb_row, mb_col,
                                 block_index);
 
-        /* Reevaluate ZEROMV after denoising. */
-        if (best_mbmode.ref_frame == INTRA_FRAME &&
+        // Reevaluate ZEROMV after denoising: for large noise content
+        // (i.e., cpi->mse_source_denoised is above threshold), do this for all
+        // blocks that did not pick ZEROMV as best mode but are using ZEROMV
+        // for denoising. Otherwise, always re-evaluate for blocks that picked
+        // INTRA mode as best mode.
+        // Avoid blocks that have been biased against ZEROMV_LAST
+        // (i.e., dot artifact candidate blocks).
+        reevaluate = (best_mbmode.ref_frame == INTRA_FRAME) ||
+                     (best_mbmode.mode != ZEROMV &&
+                      x->denoise_zeromv &&
+                      cpi->mse_source_denoised > 2000);
+        if (!dot_artifact_candidate &&
+            reevaluate &&
             x->best_zeromv_reference_frame != INTRA_FRAME)
         {
             int this_rd = 0;
             int this_ref_frame = x->best_zeromv_reference_frame;
+            rd_adjustment = 100;
             rate2 = x->ref_frame_cost[this_ref_frame] +
                     vp8_cost_mv_ref(ZEROMV, mdcounts);
             distortion2 = 0;
@@ -1214,8 +1459,8 @@
 
             if (this_rd < best_rd)
             {
-                vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
-                           sizeof(MB_MODE_INFO));
+                memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                       sizeof(MB_MODE_INFO));
             }
         }
 
@@ -1239,8 +1484,8 @@
     /* set to the best mb mode, this copy can be skip if x->skip since it
      * already has the right content */
     if (!x->skip)
-        vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
-                   sizeof(MB_MODE_INFO));
+        memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+               sizeof(MB_MODE_INFO));
 
     if (best_mbmode.mode <= B_PRED)
     {
@@ -1255,7 +1500,6 @@
     update_mvcount(x, &best_ref_mv);
 }
 
-
 void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
 {
     int error4x4, error16x16 = INT_MAX;
@@ -1279,7 +1523,7 @@
                                          xd->dst.y_stride,
                                          xd->predictor,
                                          16);
-        distortion = vp8_variance16x16
+        distortion = vpx_variance16x16
             (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
         rate = x->mbmode_cost[xd->frame_type][mode];
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c
index 250d04c..debd304 100644
--- a/libvpx/vp8/encoder/picklpf.c
+++ b/libvpx/vp8/encoder/picklpf.c
@@ -9,10 +9,11 @@
  */
 
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/alloccommon.h"
@@ -23,8 +24,8 @@
 
 extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
-void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc)
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                    YV12_BUFFER_CONFIG *dst_ybc)
 {
     unsigned char *src_y, *dst_y;
     int yheight;
@@ -49,7 +50,7 @@
     src_y = src_ybc->y_buffer + yoffset;
     dst_y = dst_ybc->y_buffer + yoffset;
 
-    vpx_memcpy(dst_y, src_y, ystride * linestocopy);
+    memcpy(dst_y, src_y, ystride * linestocopy);
 }
 
 static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
@@ -83,7 +84,7 @@
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride,
+            Total += vpx_mse16x16(src + j, source->y_stride,
                                                      dst + j, dest->y_stride,
                                                      &sse);
         }
@@ -142,7 +143,7 @@
     int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
     int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
     int filt_val;
-    int best_filt_val = cm->filter_level;
+    int best_filt_val;
     YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
 
     /* Replace unfiltered frame buffer with a new one */
@@ -173,7 +174,7 @@
     /* Get the err using the previous frame's filter value. */
 
     /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+    yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
     vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
     best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
@@ -184,7 +185,7 @@
     while (filt_val >= min_filter_level)
     {
         /* Apply the loop filter */
-        vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+        yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
         vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
         /* Get the err for filtered frame */
@@ -214,7 +215,7 @@
         while (filt_val < max_filter_level)
         {
             /* Apply the loop filter */
-            vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+            yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
 
             vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
@@ -274,8 +275,7 @@
 
     int filter_step;
     int filt_high = 0;
-    /* Start search at previous frame filter level */
-    int filt_mid = cm->filter_level;
+    int filt_mid;
     int filt_low = 0;
     int filt_best;
     int filt_direction = 0;
@@ -287,7 +287,7 @@
 
     YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
 
-    vpx_memset(ss_err, 0, sizeof(ss_err));
+    memset(ss_err, 0, sizeof(ss_err));
 
     /* Replace unfiltered frame buffer with a new one */
     cm->frame_to_show = &cpi->pick_lf_lvl_frame;
diff --git a/libvpx/vp8/encoder/ppc/csystemdependent.c b/libvpx/vp8/encoder/ppc/csystemdependent.c
deleted file mode 100644
index 63f2357..0000000
--- a/libvpx/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *coeff, short *dqcoeff);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-
-// ppc
-extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp8_sad16x16_ppc;
-extern SADFunction vp8_sad16x8_ppc;
-extern SADFunction vp8_sad8x16_ppc;
-extern SADFunction vp8_sad8x8_ppc;
-extern SADFunction vp8_sad4x4_ppc;
-
-extern variance_function vp8_variance16x16_ppc;
-extern variance_function vp8_variance8x16_ppc;
-extern variance_function vp8_variance16x8_ppc;
-extern variance_function vp8_variance8x8_ppc;
-extern variance_function vp8_variance4x4_ppc;
-extern variance_function vp8_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-void vp8_cmachine_specific_config(void)
-{
-    // Pure C:
-    vp8_mbuverror               = vp8_mbuverror_c;
-    vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
-    vp8_short_fdct4x4            = vp8_short_fdct4x4_ppc;
-    vp8_short_fdct8x4            = vp8_short_fdct8x4_ppc;
-    vp8_fast_fdct4x4             = vp8_short_fdct4x4_ppc;
-    vp8_fast_fdct8x4             = vp8_short_fdct8x4_ppc;
-    short_walsh4x4               = vp8_short_walsh4x4_c;
-
-    vp8_variance4x4             = vp8_variance4x4_ppc;
-    vp8_variance8x8             = vp8_variance8x8_ppc;
-    vp8_variance8x16            = vp8_variance8x16_ppc;
-    vp8_variance16x8            = vp8_variance16x8_ppc;
-    vp8_variance16x16           = vp8_variance16x16_ppc;
-    vp8_mse16x16                = vp8_mse16x16_ppc;
-
-    vp8_sub_pixel_variance4x4     = vp8_sub_pixel_variance4x4_ppc;
-    vp8_sub_pixel_variance8x8     = vp8_sub_pixel_variance8x8_ppc;
-    vp8_sub_pixel_variance8x16    = vp8_sub_pixel_variance8x16_ppc;
-    vp8_sub_pixel_variance16x8    = vp8_sub_pixel_variance16x8_ppc;
-    vp8_sub_pixel_variance16x16   = vp8_sub_pixel_variance16x16_ppc;
-
-    vp8_get_mb_ss                 = vp8_get_mb_ss_c;
-    vp8_get4x4sse_cs            = vp8_get4x4sse_cs_c;
-
-    vp8_sad16x16                = vp8_sad16x16_ppc;
-    vp8_sad16x8                 = vp8_sad16x8_ppc;
-    vp8_sad8x16                 = vp8_sad8x16_ppc;
-    vp8_sad8x8                  = vp8_sad8x8_ppc;
-    vp8_sad4x4                  = vp8_sad4x4_ppc;
-
-    vp8_block_error              = vp8_block_error_ppc;
-    vp8_mbblock_error            = vp8_mbblock_error_c;
-
-    vp8_subtract_b               = vp8_subtract_b_c;
-    vp8_subtract_mby             = vp8_subtract_mby_ppc;
-    vp8_subtract_mbuv            = vp8_subtract_mbuv_ppc;
-}
diff --git a/libvpx/vp8/encoder/ppc/encodemb_altivec.asm b/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
deleted file mode 100644
index 6e0099d..0000000
--- a/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_subtract_mbuv_ppc
-    .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r9, 256
-    add     r3, r3, r9
-    add     r3, r3, r9
-    add     r6, r6, r9
-
-    li      r10, 16
-    li      r9,  4
-    mtctr   r9
-
-    vspltisw v0, 0
-
-mbu_loop:
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r4, r4, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-
-    add     r4, r4, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbu_loop
-
-    mtctr   r9
-
-mbv_loop:
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r5, r5, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-
-    add     r5, r5, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbv_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-    vspltisw v0, 0
-
-mby_loop:
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r5           ;# pred
-
-    add     r4, r4, r6
-    addi    r5, r5, 16
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vmrglb  v3, v0, v1          ;# unpack low src  to short
-    vmrglb  v4, v0, v2          ;# unpack low pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mby_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
diff --git a/libvpx/vp8/encoder/ppc/fdct_altivec.asm b/libvpx/vp8/encoder/ppc/fdct_altivec.asm
deleted file mode 100644
index 935d0cb..0000000
--- a/libvpx/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_short_fdct4x4_ppc
-    .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;#   in normalization (fwd is twice unitary, inv is half unitary)
-;#   and that they are of course transposes of each other.
-;#
-;#   The following three accomplish most of implementation and
-;#   are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    li      r6, 16
-
-    load_c v0, dct_tab, 0, r9, r10
-    lvx     v1,   r6, r10
-    addi    r10, r10, 32
-    lvx     v2,    0, r10
-    lvx     v3,   r6, r10
-
-    load_c v4, ppc_dctperm_tab,  0, r9, r10
-    load_c v5, ppc_dctperm_tab, r6, r9, r10
-
-    load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
-;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
-;#   For fwd transform, indices are horizontal positions, then frequencies.
-;#   For inverse transform, frequencies then positions.
-;#   The two resulting  A0..A3  B0..B3  are later combined
-;#   and vertically transformed.
-
-.macro two_rows_horiz Dst
-    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
-
-    vmsumshm v10, v0, v8, v6
-    vmsumshm v10, v1, v9, v10
-    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
-
-    vmsumshm v11, v2, v8, v6
-    vmsumshm v11, v3, v9, v11
-    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
-
-    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
-    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;#   forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
-    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
-    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v10, v8, v7
-
-    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
-    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v8, v8, v7
-
-    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
-.endm
-
-.macro two_rows_h Dest
-    stw     r0,  0(r8)
-    lwz     r0,  4(r3)
-    stw     r0,  4(r8)
-    lwzux   r0, r3,r5
-    stw     r0,  8(r8)
-    lwz     r0,  4(r3)
-    stw     r0, 12(r8)
-    lvx     v8,  0,r8
-    two_rows_horiz \Dest
-.endm
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8,  r1, 0
-    addi    r10, r3, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    ;# Next block
-    addi    r3, r10, 8
-    addi    r4, r4, 32
-    lvx     v6, 0, r9           ;# v6 = Hround
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .data
-    .align 4
-ppc_dctperm_tab:
-    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
-    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
-    .align 4
-dct_tab:
-    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
-    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
-    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
-    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
-    .align 4
-round_tab:
-    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
-    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
diff --git a/libvpx/vp8/encoder/ppc/rdopt_altivec.asm b/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
deleted file mode 100644
index ba48230..0000000
--- a/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_block_error_ppc
-
-    .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    stw     r5, 12(r1)          ;# transfer dc to vector register
-
-    lvx     v0, 0, r3           ;# Coeff
-    lvx     v1, 0, r4           ;# dqcoeff
-
-    li      r10, 16
-
-    vspltisw v3, 0
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v2, v0, v0, v3     ;# multiply differences
-
-    lvx     v0, r10, r3         ;# Coeff
-    lvx     v1, r10, r4         ;# dqcoeff
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v1, v0, v0, v2     ;# multiply differences
-    vsumsws v1, v1, v3          ;# sum up
-
-    stvx    v1, 0, r1
-    lwz     r3, 12(r1)          ;# return value
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
diff --git a/libvpx/vp8/encoder/quantize.h b/libvpx/vp8/encoder/quantize.h
index c739b26..7d36c2b 100644
--- a/libvpx/vp8/encoder/quantize.h
+++ b/libvpx/vp8/encoder/quantize.h
@@ -18,6 +18,9 @@
 
 struct VP8_COMP;
 struct macroblock;
+extern void vp8_quantize_mb(struct macroblock *x);
+extern void vp8_quantize_mby(struct macroblock *x);
+extern void vp8_quantize_mbuv(struct macroblock *x);
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
 extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
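
The three prototypes added above correspond to the quantize entry points that lose their _c suffix when quantize.c is renamed to vp8_quantize.c later in this change, so other encoder files can call them directly through this header. A minimal call sketch (the wrapper name is illustrative, not from the tree), assuming the macroblock's coefficients and quantizer state are already populated:

    #include "vp8/encoder/quantize.h"

    /* Quantize every block of one macroblock; vp8_quantize_mby() and
     * vp8_quantize_mbuv() are the luma-only and chroma-only variants
     * declared above. */
    static void quantize_whole_mb(struct macroblock *x)
    {
        vp8_quantize_mb(x);
    }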
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
index c51650c..e8796a1 100644
--- a/libvpx/vp8/encoder/ratectrl.c
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -296,7 +296,7 @@
 
     vp8_default_coef_probs(& cpi->common);
 
-    vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+    memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
     {
         int flag[2] = {1, 1};
         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
@@ -305,9 +305,9 @@
     /* Make sure we initialize separate contexts for altref,gold, and normal.
      * TODO shouldn't need 3 different copies of structure to do this!
      */
-    vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-    vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
-    vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+    memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+    memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+    memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
 
     cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
 
@@ -708,7 +708,13 @@
                     Adjustment = (cpi->this_frame_target - min_frame_target);
 
                 if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
-                    cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+                {
+                    Adjustment = (cpi->current_gf_interval - 1) * Adjustment;
+                    // Limit adjustment to 10% of current target.
+                    if (Adjustment > (10 * cpi->this_frame_target) / 100)
+                        Adjustment = (10 * cpi->this_frame_target) / 100;
+                    cpi->this_frame_target += Adjustment;
+                }
                 else
                     cpi->this_frame_target -= Adjustment;
             }
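
The hunk above stops the mid-interval golden-frame boost from being applied unchecked: the scaled adjustment is now capped at 10% of the current frame target before it is added. A worked example with illustrative numbers (not taken from the source):

    Adjustment = 800 bits, current_gf_interval = 7, this_frame_target = 20000 bits
    raw boost             = (7 - 1) * 800           = 4800 bits
    cap (10% of target)   = 20000 / 10              = 2000 bits
    new this_frame_target = 20000 + min(4800, 2000) = 22000 bits (instead of 24800)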
@@ -1209,6 +1215,11 @@
 {
     int Q = cpi->active_worst_quality;
 
+    if (cpi->force_maxqp == 1) {
+      cpi->active_worst_quality = cpi->worst_quality;
+      return cpi->worst_quality;
+    }
+
     /* Reset Zbin OQ value */
     cpi->mb.zbin_over_quant = 0;
 
@@ -1553,3 +1564,46 @@
     }
     return 1;
 }
+// If this just encoded frame (mcomp/transform/quant, but before loopfilter and
+// pack_bitstream) has large overshoot, and was not being encoded close to the
+// max QP, then drop this frame and force next frame to be encoded at max QP.
+// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// TODO(marpan): Should do this exit condition during the encode_frame
+// (i.e., halfway during the encoding of the frame) to save cycles.
+int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
+  if (cpi->pass == 0 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+      cpi->drop_frames_allowed == 0 &&
+      cpi->common.frame_type != KEY_FRAME) {
+    // Note: the "projected_frame_size" from encode_frame() only gives estimate
+    // of mode/motion vector rate (in non-rd mode): so below we only require
+    // that projected_frame_size is somewhat greater than per-frame-bandwidth,
+    // but add additional condition with high threshold on prediction residual.
+
+    // QP threshold: only allow dropping if we are not close to qp_max.
+    int thresh_qp = 3 * cpi->worst_quality >> 2;
+    // Rate threshold, in bytes.
+    int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
+    // Threshold for the average (over all macroblocks) of the pixel-sum
+    // residual error over 16x16 block. Should add QP dependence on threshold?
+    int thresh_pred_err_mb = (256 << 4);
+    int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
+    if (Q < thresh_qp &&
+        cpi->projected_frame_size > thresh_rate &&
+        pred_err_mb > thresh_pred_err_mb) {
+      // Drop this frame: advance frame counters, and set force_maxqp flag.
+      cpi->common.current_video_frame++;
+      cpi->frames_since_key++;
+      // Flag to indicate we will force next frame to be encoded at max QP.
+      cpi->force_maxqp = 1;
+      return 1;
+    } else {
+      cpi->force_maxqp = 0;
+      return 0;
+    }
+    cpi->force_maxqp = 0;
+    return 0;
+  }
+  cpi->force_maxqp = 0;
+  return 0;
+}
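
vp8_drop_encodedframe_overshoot() returns 1 when the just-encoded frame should be discarded and the next frame forced to max QP; the earlier hunk in this file makes the Q regulation honor cpi->force_maxqp. A minimal sketch of how a caller might consume the return value (the wrapper name and the surrounding control flow are illustrative assumptions, not code from this tree):

    /* After encode_frame() has produced cpi->projected_frame_size,
     * decide whether to throw the frame away instead of packing it. */
    static int maybe_drop_overshoot_frame(VP8_COMP *cpi, int Q)
    {
        if (vp8_drop_encodedframe_overshoot(cpi, Q))
        {
            /* Frame counters were already advanced inside the call and
             * cpi->force_maxqp is set, so the next frame will be coded
             * at cpi->worst_quality; skip bitstream packing for this one. */
            return 1;
        }
        return 0;
    }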
diff --git a/libvpx/vp8/encoder/ratectrl.h b/libvpx/vp8/encoder/ratectrl.h
index 829697f..703de9f 100644
--- a/libvpx/vp8/encoder/ratectrl.h
+++ b/libvpx/vp8/encoder/ratectrl.h
@@ -30,6 +30,8 @@
 /* return of 0 means drop frame */
 extern int vp8_pick_frame_size(VP8_COMP *cpi);
 
+extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index 2f6f5d0..fdff378 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -15,6 +15,7 @@
 #include <assert.h>
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
@@ -27,8 +28,8 @@
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/quant_common.h"
 #include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/variance.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx_dsp/variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
@@ -499,17 +500,17 @@
 
     if ((mv_row | mv_col) & 7)
     {
-        vp8_sub_pixel_variance8x8(uptr, pre_stride,
+        vpx_sub_pixel_variance8x8(uptr, pre_stride,
             mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
-        vp8_sub_pixel_variance8x8(vptr, pre_stride,
+        vpx_sub_pixel_variance8x8(vptr, pre_stride,
             mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
         sse2 += sse1;
     }
     else
     {
-        vp8_variance8x8(uptr, pre_stride,
+        vpx_variance8x8(uptr, pre_stride,
             upred_ptr, uv_stride, &sse2);
-        vp8_variance8x8(vptr, pre_stride,
+        vpx_variance8x8(vptr, pre_stride,
             vpred_ptr, uv_stride, &sse1);
         sse2 += sse1;
     }
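
VP8_UVSSE now calls the shared vpx_dsp variance kernels instead of the vp8-local copies. For reference, the assumed shape of the new entry points (a sketch; the real declarations live with vpx_dsp/variance.h and the RTCD header included above): both return the block variance and write the raw sum of squared errors through the last argument, and the sub-pixel variant takes the 1/8-pel chroma MV fraction as x/y offsets, which is why the call sites mask the MV with & 7.

    unsigned int vpx_variance8x8(const unsigned char *src_ptr, int src_stride,
                                 const unsigned char *ref_ptr, int ref_stride,
                                 unsigned int *sse);
    unsigned int vpx_sub_pixel_variance8x8(const unsigned char *src_ptr,
                                           int src_stride,
                                           int xoffset, int yoffset,
                                           const unsigned char *ref_ptr,
                                           int ref_stride,
                                           unsigned int *sse);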
@@ -555,8 +556,8 @@
     ENTROPY_CONTEXT *ta;
     ENTROPY_CONTEXT *tl;
 
-    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -650,8 +651,8 @@
      * a temp buffer that meets the stride requirements, but we are only
      * interested in the left 4x4 block
      * */
-    DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16*4);
-    DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+    DECLARE_ALIGNED(16, unsigned char,  best_predictor[16*4]);
+    DECLARE_ALIGNED(16, short, best_dqcoeff[16]);
     int dst_stride = x->e_mbd.dst.y_stride;
     unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
 
@@ -691,7 +692,7 @@
             *a = tempa;
             *l = templ;
             copy_predictor(best_predictor, b->predictor);
-            vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+            memcpy(best_dqcoeff, b->dqcoeff, 32);
         }
     }
     b->bmi.as_mode = *best_mode;
@@ -715,8 +716,8 @@
     ENTROPY_CONTEXT *tl;
     const int *bmode_costs;
 
-    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -820,8 +821,8 @@
     ENTROPY_CONTEXT *ta;
     ENTROPY_CONTEXT *tl;
 
-    vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -837,6 +838,9 @@
 static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int fullpixel)
 {
+    (void)cpi;
+    (void)fullpixel;
+
     vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
     vp8_subtract_mbuv(x->src_diff,
         x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -854,6 +858,9 @@
 static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
                           int *distortion, int fullpixel)
 {
+    (void)cpi;
+    (void)fullpixel;
+
     vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
     vp8_subtract_mbuv(x->src_diff,
         x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -1122,8 +1129,8 @@
     ENTROPY_CONTEXT *ta_b;
     ENTROPY_CONTEXT *tl_b;
 
-    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
@@ -1166,8 +1173,8 @@
             ENTROPY_CONTEXT *ta_s;
             ENTROPY_CONTEXT *tl_s;
 
-            vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-            vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+            memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+            memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
 
             ta_s = (ENTROPY_CONTEXT *)&t_above_s;
             tl_s = (ENTROPY_CONTEXT *)&t_left_s;
@@ -1323,14 +1330,14 @@
                 mode_selected = this_mode;
                 best_label_rd = this_rd;
 
-                vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-                vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+                memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+                memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
 
             }
         } /*for each 4x4 mode*/
 
-        vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+        memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+        memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
 
         labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
                     bsi->ref_mv, x->mvcost);
@@ -1386,7 +1393,7 @@
     int i;
     BEST_SEG_INFO bsi;
 
-    vpx_memset(&bsi, 0, sizeof(bsi));
+    memset(&bsi, 0, sizeof(bsi));
 
     bsi.segment_rd = best_rd;
     bsi.ref_mv = best_ref_mv;
@@ -1655,7 +1662,6 @@
             mv.as_mv.row = mvx[vcnt/2];
             mv.as_mv.col = mvy[vcnt/2];
 
-            find = 1;
             /* sr is set to 0 to allow calling function to decide the search
              * range.
              */
@@ -1685,16 +1691,16 @@
     }else if(xd->mb_to_top_edge==0)
     {   /* only has left MB for sad calculation. */
         near_sad[0] = near_sad[2] = INT_MAX;
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
     }else if(xd->mb_to_left_edge ==0)
     {   /* only has above MB for sad calculation. */
         near_sad[1] = near_sad[2] = INT_MAX;
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
     }else
     {
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
-        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1709,14 +1715,14 @@
         if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
 
         if(near_sad[4] != INT_MAX)
-            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride);
         if(near_sad[5] != INT_MAX)
-            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
-        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride);
         if(near_sad[6] != INT_MAX)
-            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride);
         if(near_sad[7] != INT_MAX)
-            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
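
All cpi->fn_ptr[BLOCK_16X16].sdf call sites drop the trailing UINT_MAX argument: the SAD kernels exported from vpx_dsp no longer take a max_sad early-exit threshold. Assumed shape of the function pointer after the change (a sketch; the actual typedef lives with the vpx_dsp SAD/variance declarations):

    /* SAD over one block: sum of absolute differences between the source
     * block and the reference block, given their strides. */
    typedef unsigned int (*sad_fn_t)(const unsigned char *src_ptr,
                                     int src_stride,
                                     const unsigned char *ref_ptr,
                                     int ref_stride);

    /* Call pattern used above: compare the current MB against the MB to
     * its left in the reconstructed frame. */
    unsigned int sad_left = cpi->fn_ptr[BLOCK_16X16].sdf(
        src_y_ptr, b->src_stride, xd->dst.y_buffer - 16, xd->dst.y_stride);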
@@ -1778,7 +1784,7 @@
         if(threshold < x->encode_breakout)
             threshold = x->encode_breakout;
 
-        var = vp8_variance16x16
+        var = vpx_variance16x16
                 (*(b->base_src), b->src_stride,
                 x->e_mbd.predictor, 16, &sse);
 
@@ -1920,8 +1926,8 @@
                       (rd->distortion2-rd->distortion_uv));
 
     best_mode->rd = this_rd;
-    vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
-    vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+    memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+    memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
 
     if ((this_mode == B_PRED) || (this_mode == SPLITMV))
     {
@@ -1983,9 +1989,9 @@
     best_mode.rd = INT_MAX;
     best_mode.yrd = INT_MAX;
     best_mode.intra_rd = INT_MAX;
-    vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
-    vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
-    vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+    memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+    memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+    memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
 
     /* Setup search priorities */
     get_reference_search_order(cpi, ref_frame_map);
@@ -2287,7 +2293,6 @@
                 mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
                 /* Further step/diamond searches as necessary */
-                n = 0;
                 further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
                 n = num00;
@@ -2554,8 +2559,6 @@
                                                intra_rd_penalty, cpi, x);
             if (this_rd < best_mode.rd || x->skip)
             {
-                /* Note index of best mode so far */
-                best_mode_index = mode_index;
                 *returnrate = rd.rate2;
                 *returndistortion = rd.distortion2;
                 update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
@@ -2580,7 +2583,7 @@
 
 
     /* macroblock modes */
-    vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+    memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
 
     if (best_mode.mbmode.mode == B_PRED)
     {
@@ -2593,7 +2596,7 @@
         for (i = 0; i < 16; i++)
             xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
 
-        vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+        memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
 
         x->e_mbd.mode_info_context->mbmi.mv.as_int =
                                       x->partition_info->bmi[15].mv.as_int;
diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h
index e0da35e..b4fcd10 100644
--- a/libvpx/vp8/encoder/rdopt.h
+++ b/libvpx/vp8/encoder/rdopt.h
@@ -136,6 +136,9 @@
     int near_sadidx[]
 );
 void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
+int VP8_UVSSE(MACROBLOCK *x);
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp8/encoder/segmentation.c b/libvpx/vp8/encoder/segmentation.c
index 37972e2..fdd22fc 100644
--- a/libvpx/vp8/encoder/segmentation.c
+++ b/libvpx/vp8/encoder/segmentation.c
@@ -23,7 +23,7 @@
     if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
     {
         /* Reset Gf usage monitors */
-        vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
         cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
     }
     else
diff --git a/libvpx/vp8/encoder/ssim.c b/libvpx/vp8/encoder/ssim.c
deleted file mode 100644
index e751608..0000000
--- a/libvpx/vp8/encoder/ssim.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyx_int.h"
-
-void vp8_ssim_parms_16x16_c
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-)
-{
-    int i,j;
-    for(i=0;i<16;i++,s+=sp,r+=rp)
-     {
-         for(j=0;j<16;j++)
-         {
-             *sum_s += s[j];
-             *sum_r += r[j];
-             *sum_sq_s += s[j] * s[j];
-             *sum_sq_r += r[j] * r[j];
-             *sum_sxr += s[j] * r[j];
-         }
-     }
-}
-void vp8_ssim_parms_8x8_c
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-)
-{
-    int i,j;
-    for(i=0;i<8;i++,s+=sp,r+=rp)
-     {
-         for(j=0;j<8;j++)
-         {
-             *sum_s += s[j];
-             *sum_r += r[j];
-             *sum_sq_s += s[j] * s[j];
-             *sum_sq_r += r[j] * r[j];
-             *sum_sxr += s[j] * r[j];
-         }
-     }
-}
-
-const static int64_t cc1 =  26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
-
-static double similarity
-(
-    unsigned long sum_s,
-    unsigned long sum_r,
-    unsigned long sum_sq_s,
-    unsigned long sum_sq_r,
-    unsigned long sum_sxr,
-    int count
-)
-{
-    int64_t ssim_n, ssim_d;
-    int64_t c1, c2;
-
-    //scale the constants by number of pixels
-    c1 = (cc1*count*count)>>12;
-    c2 = (cc2*count*count)>>12;
-
-    ssim_n = (2*sum_s*sum_r+ c1)*((int64_t) 2*count*sum_sxr-
-          (int64_t) 2*sum_s*sum_r+c2);
-
-    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
-        ((int64_t)count*sum_sq_s-(int64_t)sum_s*sum_s +
-        (int64_t)count*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
-
-    return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
-static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    vp8_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// TODO: (jbb) tried to scale this function such that we may be able to use it
-// for distortion metric in mode selection code ( provided we do a reconstruction)
-long dssim(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    int64_t ssim3;
-    int64_t ssim_n1,ssim_n2;
-    int64_t ssim_d1,ssim_d2;
-    int64_t ssim_t1,ssim_t2;
-    int64_t c1, c2;
-
-    // normalize by 256/64
-    c1 = cc1*16;
-    c2 = cc2*16;
-
-    vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    ssim_n1 = (2*sum_s*sum_r+ c1);
-
-    ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
-
-    ssim_d1 =((int64_t)sum_s*sum_s +(int64_t)sum_r*sum_r+c1);
-
-    ssim_d2 = (256 * (int64_t) sum_sq_s-(int64_t) sum_s*sum_s +
-                    (int64_t) 256*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
-
-    ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
-    ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
-
-    ssim3 = 256 *ssim_t1 * ssim_t2;
-    if(ssim3 <0 )
-        ssim3=0;
-    return (long)( ssim3  );
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-double vp8_ssim2
-(
-    unsigned char *img1,
-    unsigned char *img2,
-    int stride_img1,
-    int stride_img2,
-    int width,
-    int height
-)
-{
-    int i,j;
-    int samples =0;
-    double ssim_total=0;
-
-    // sample point start with each 4x4 location
-    for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
-    {
-        for(j=0; j < width-8; j+=4 )
-        {
-            double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2);
-            ssim_total += v;
-            samples++;
-        }
-    }
-    ssim_total /= samples;
-    return ssim_total;
-}
-double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-)
-{
-    double a, b, c;
-    double ssimv;
-
-    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width,
-                 source->y_height);
-
-    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    ssimv = a * .8 + .1 * (b + c);
-
-    *weight = 1;
-
-    return ssimv;
-}
-
-double vp8_calc_ssimg
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    double *ssim_y,
-    double *ssim_u,
-    double *ssim_v
-)
-{
-    double ssim_all = 0;
-    double a, b, c;
-
-    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width,
-                 source->y_height);
-
-    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-    *ssim_y = a;
-    *ssim_u = b;
-    *ssim_v = c;
-    ssim_all = (a * 4 + b + c) /6;
-
-    return ssim_all;
-}
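
The deleted ssim.c removes the VP8-local SSIM measurement (presumably superseded by a shared copy elsewhere in the tree). For reference, the similarity() helper above computes the standard SSIM index over a window of N pixels (N = 64 for the 8x8 window, 256 for 16x16), with C1 and C2 derived from the cc1/cc2 constants:

    SSIM(s, r) = \frac{(2\mu_s\mu_r + C_1)(2\sigma_{sr} + C_2)}
                      {(\mu_s^2 + \mu_r^2 + C_1)(\sigma_s^2 + \sigma_r^2 + C_2)}

where \mu is the window mean, \sigma^2 the variance and \sigma_{sr} the covariance; the integer code carries the raw sums (sum_s, sum_sq_s, sum_sxr) and scales C1/C2 by count^2 instead of dividing through by N. vp8_calc_ssim then combines the planes as 0.8*Y + 0.1*(U + V), and vp8_calc_ssimg as (4*Y + U + V) / 6.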
diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c
index 4dc0d95..85d26c2 100644
--- a/libvpx/vp8/encoder/temporal_filter.c
+++ b/libvpx/vp8/encoder/temporal_filter.c
@@ -12,7 +12,7 @@
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
 #include "vp8/common/systemdependent.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
@@ -163,6 +163,8 @@
     int pre = d->offset;
     int pre_stride = x->e_mbd.pre.y_stride;
 
+    (void)error_thresh;
+
     best_ref_mv1.as_int = 0;
     best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
     best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
@@ -236,12 +238,12 @@
     int mb_rows = cpi->common.mb_rows;
     int mb_y_offset = 0;
     int mb_uv_offset = 0;
-    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED(16, unsigned int, accumulator[16*16 + 8*8 + 8*8]);
+    DECLARE_ALIGNED(16, unsigned short, count[16*16 + 8*8 + 8*8]);
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
     YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
     unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED(16, unsigned char,  predictor[16*16 + 8*8 + 8*8]);
 
     /* Save input state */
     unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -272,8 +274,8 @@
             int i, j, k;
             int stride;
 
-            vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-            vpx_memset(count, 0, 384*sizeof(unsigned short));
+            memset(accumulator, 0, 384*sizeof(unsigned int));
+            memset(count, 0, 384*sizeof(unsigned short));
 
 #if ALT_REF_MC_ENABLED
             cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
@@ -500,7 +502,7 @@
     start_frame = distance + frames_to_blur_forward;
 
     /* Setup frame pointers, NULL indicates frame not included in filter */
-    vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+    memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
     for (frame = 0; frame < frames_to_blur; frame++)
     {
         int which_buffer =  start_frame - frame;
diff --git a/libvpx/vp8/encoder/tokenize.c b/libvpx/vp8/encoder/tokenize.c
index 2dc8205..afd46fb 100644
--- a/libvpx/vp8/encoder/tokenize.c
+++ b/libvpx/vp8/encoder/tokenize.c
@@ -421,7 +421,7 @@
 
 void init_context_counters(void)
 {
-    vpx_memset(context_counters, 0, sizeof(context_counters));
+    memset(context_counters, 0, sizeof(context_counters));
 }
 
 void print_context_counters()
@@ -596,13 +596,13 @@
     /* Clear entropy contexts for Y2 blocks */
     if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
     {
-        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+        memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+        memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
     }
     else
     {
-        vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
-        vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+        memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+        memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
     }
 
 }
diff --git a/libvpx/vp8/encoder/vp8_asm_enc_offsets.c b/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
deleted file mode 100644
index a4169b3..0000000
--- a/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp8_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp8_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp8_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp8_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp8_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp8_block_quant_shift,                   offsetof(BLOCK, quant_shift));
-
-DEFINE(vp8_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp8_block_base_src,                      offsetof(BLOCK, base_src));
-DEFINE(vp8_block_src,                           offsetof(BLOCK, src));
-DEFINE(vp8_block_src_diff,                      offsetof(BLOCK, src_diff));
-DEFINE(vp8_block_src_stride,                    offsetof(BLOCK, src_stride));
-
-DEFINE(vp8_blockd_predictor,                    offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
-DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
-DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
-DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
-DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
-DEFINE(vp8_writer_buffer_end,                   offsetof(vp8_writer, buffer_end));
-DEFINE(vp8_writer_error,                        offsetof(vp8_writer, error));
-
-DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
-
-DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));
-
-DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
-DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));
-
-DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));
-
-DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
-DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc ,                            offsetof(VP8_COMP, bc));
-DEFINE(vp8_writer_sz ,                          sizeof(vp8_writer));
-
-DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
-
-DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
-
- * These are used in vp8cx_pack_tokens.  They are hard coded so if their sizes
- * change they will have to be adjusted.
- */
-
-#if HAVE_EDSP
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
-#endif
diff --git a/libvpx/vp8/encoder/quantize.c b/libvpx/vp8/encoder/vp8_quantize.c
similarity index 69%
rename from libvpx/vp8/encoder/quantize.c
rename to libvpx/vp8/encoder/vp8_quantize.c
index fda997f..ee922c9 100644
--- a/libvpx/vp8/encoder/quantize.c
+++ b/libvpx/vp8/encoder/vp8_quantize.c
@@ -13,60 +13,9 @@
 #include "vpx_mem/vpx_mem.h"
 
 #include "onyx_int.h"
-#include "quantize.h"
+#include "vp8/encoder/quantize.h"
 #include "vp8/common/quant_common.h"
 
-#define EXACT_QUANT
-
-#ifdef EXACT_FASTQUANT
-void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *coeff_ptr       = b->coeff;
-    short *zbin_ptr        = b->zbin;
-    short *round_ptr       = b->round;
-    short *quant_ptr       = b->quant_fast;
-    unsigned char *quant_shift_ptr = b->quant_shift;
-    short *qcoeff_ptr      = d->qcoeff;
-    short *dqcoeff_ptr     = d->dqcoeff;
-    short *dequant_ptr     = d->dequant;
-
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-
-    eob = -1;
-
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-        zbin = zbin_ptr[rc] ;
-
-        sz = (z >> 31);                              /* sign of z */
-        x  = (z ^ sz) - sz;                          /* x = abs(z) */
-
-        if (x >= zbin)
-        {
-            x += round_ptr[rc];
-            y  = ((((x * quant_ptr[rc]) >> 16) + x)
-                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
-            x  = (y ^ sz) - sz;                      /* get the sign back */
-            qcoeff_ptr[rc] = x;                      /* write to destination */
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
-
-            if (y)
-            {
-                eob = i;                             /* last nonzero coeffs */
-            }
-        }
-    }
-    *d->eob = (char)(eob + 1);
-}
-
-#else
-
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
     int i, rc, eob;
@@ -100,9 +49,6 @@
     *d->eob = (char)(eob + 1);
 }
 
-#endif
-
-#ifdef EXACT_QUANT
 void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
     int i, rc, eob;
@@ -119,8 +65,8 @@
     short *dequant_ptr     = d->dequant;
     short zbin_oq_value    = b->zbin_extra;
 
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
+    memset(qcoeff_ptr, 0, 32);
+    memset(dqcoeff_ptr, 0, 32);
 
     eob = -1;
 
@@ -155,118 +101,7 @@
     *d->eob = (char)(eob + 1);
 }
 
-/* Perform regular quantization, with unbiased rounding and no zero bin. */
-void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i;
-    int rc;
-    int eob;
-    int x;
-    int y;
-    int z;
-    int sz;
-    short *coeff_ptr;
-    short *quant_ptr;
-    short *quant_shift_ptr;
-    short *qcoeff_ptr;
-    short *dqcoeff_ptr;
-    short *dequant_ptr;
-
-    coeff_ptr       = b->coeff;
-    quant_ptr       = b->quant;
-    quant_shift_ptr = b->quant_shift;
-    qcoeff_ptr      = d->qcoeff;
-    dqcoeff_ptr     = d->dqcoeff;
-    dequant_ptr     = d->dequant;
-    eob = - 1;
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-    for (i = 0; i < 16; i++)
-    {
-        int dq;
-        int rounding;
-
-        /*TODO: These arrays should be stored in zig-zag order.*/
-        rc = vp8_default_zig_zag1d[i];
-        z = coeff_ptr[rc];
-        dq = dequant_ptr[rc];
-        rounding = dq >> 1;
-        /* Sign of z. */
-        sz = -(z < 0);
-        x = (z + sz) ^ sz;
-        x += rounding;
-        if (x >= dq)
-        {
-            /* Quantize x. */
-            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
-            /* Put the sign back. */
-            x = (y + sz) ^ sz;
-            /* Save the coefficient and its dequantized value. */
-            qcoeff_ptr[rc] = x;
-            dqcoeff_ptr[rc] = x * dq;
-            /* Remember the last non-zero coefficient. */
-            if (y)
-                eob = i;
-        }
-    }
-
-    *d->eob = (char)(eob + 1);
-}
-
-#else
-
-void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *zbin_boost_ptr = b->zrun_zbin_boost;
-    short *coeff_ptr      = b->coeff;
-    short *zbin_ptr       = b->zbin;
-    short *round_ptr      = b->round;
-    short *quant_ptr      = b->quant;
-    short *qcoeff_ptr     = d->qcoeff;
-    short *dqcoeff_ptr    = d->dqcoeff;
-    short *dequant_ptr    = d->dequant;
-    short zbin_oq_value   = b->zbin_extra;
-
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-
-    eob = -1;
-
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-
-        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-
-        zbin_boost_ptr ++;
-        sz = (z >> 31);                              /* sign of z */
-        x  = (z ^ sz) - sz;                          /* x = abs(z) */
-
-        if (x >= zbin)
-        {
-            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
-            x  = (y ^ sz) - sz;                      /* get the sign back */
-            qcoeff_ptr[rc]  = x;                     /* write to destination */
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
-
-            if (y)
-            {
-                eob = i;                             /* last nonzero coeffs */
-                zbin_boost_ptr = &b->zrun_zbin_boost[0]; /* reset zrl */
-            }
-        }
-    }
-
-    *d->eob = (char)(eob + 1);
-}
-
-#endif
-
-void vp8_quantize_mby_c(MACROBLOCK *x)
+void vp8_quantize_mby(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -279,7 +114,7 @@
         x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
 }
 
-void vp8_quantize_mb_c(MACROBLOCK *x)
+void vp8_quantize_mb(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -290,7 +125,7 @@
 }
 
 
-void vp8_quantize_mbuv_c(MACROBLOCK *x)
+void vp8_quantize_mbuv(MACROBLOCK *x)
 {
     int i;
 
@@ -298,23 +133,6 @@
         x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }
 
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
-    vp8_regular_quantize_b(b1, d1);
-    vp8_regular_quantize_b(b2, d2);
-}
-
-void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
-    vp8_fast_quantize_b_c(b1, d1);
-    vp8_fast_quantize_b_c(b2, d2);
-}
-
-
 static const int qrounding_factors[129] =
 {
     48, 48, 48, 48, 48, 48, 48, 48,
@@ -403,8 +221,6 @@
 };
 
 
-#define EXACT_QUANT
-#ifdef EXACT_QUANT
 static void invert_quant(int improved_quant, short *quant,
                          short *shift, short d)
 {
@@ -526,68 +342,6 @@
         }
     }
 }
-#else
-void vp8cx_init_quantizer(VP8_COMP *cpi)
-{
-    int i;
-    int quant_val;
-    int Q;
-
-    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
-
-    for (Q = 0; Q < QINDEX_RANGE; Q++)
-    {
-        /* dc values */
-        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
-        cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-        cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
-        cpi->common.Y1dequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        cpi->Y2quant[Q][0] = (1 << 16) / quant_val;
-        cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-        cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-        cpi->common.Y2dequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        cpi->UVquant[Q][0] = (1 << 16) / quant_val;
-        cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
-        cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
-        cpi->common.UVdequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        /* all the ac values = ; */
-        for (i = 1; i < 16; i++)
-        {
-            int rc = vp8_default_zig_zag1d[i];
-
-            quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant[Q][rc] = (1 << 16) / quant_val;
-            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.Y1dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant[Q][rc] = (1 << 16) / quant_val;
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant[Q][rc] = (1 << 16) / quant_val;
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-        }
-    }
-}
-#endif
 
 #define ZBIN_EXTRA_Y \
     (( cpi->common.Y1dequant[QIndex][1] *  \
@@ -781,6 +535,7 @@
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
     int update = 0;
     int new_delta_q;
+    int new_uv_delta_q;
     cm->base_qindex = Q;
 
     /* if any of the delta_q values are changing update flag has to be set */
@@ -788,8 +543,6 @@
 
     cm->y1dc_delta_q = 0;
     cm->y2ac_delta_q = 0;
-    cm->uvdc_delta_q = 0;
-    cm->uvac_delta_q = 0;
 
     if (Q < 4)
     {
@@ -801,6 +554,21 @@
     update |= cm->y2dc_delta_q != new_delta_q;
     cm->y2dc_delta_q = new_delta_q;
 
+    new_uv_delta_q = 0;
+    // For screen content, lower the q value for UV channel. For now, select
+    // conservative delta; same delta for dc and ac, and decrease it with lower
+    // Q, and set to 0 below some threshold. May want to condition this in
+    // future on the variance/energy in UV channel.
+    if (cpi->oxcf.screen_content_mode && Q > 40) {
+      new_uv_delta_q = -(int)(0.15 * Q);
+      // Check range: magnitude of delta is 4 bits.
+      if (new_uv_delta_q < -15) {
+        new_uv_delta_q = -15;
+      }
+    }
+    update |= cm->uvdc_delta_q != new_uv_delta_q;
+    cm->uvdc_delta_q = new_uv_delta_q;
+    cm->uvac_delta_q = new_uv_delta_q;
 
     /* Set Segment specific quatizers */
     mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
diff --git a/libvpx/vp8/encoder/x86/denoising_sse2.c b/libvpx/vp8/encoder/x86/denoising_sse2.c
index 3a4cf7e..101d646 100644
--- a/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -121,12 +121,12 @@
         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
         if (abs_sum_diff > sum_diff_thresh) {
           // Before returning to copy the block (i.e., apply no denoising),
-          // checK if we can still apply some (weaker) temporal filtering to
+          // check if we can still apply some (weaker) temporal filtering to
           // this block, that would otherwise not be denoised at all. Simplest
           // is to apply an additional adjustment to running_avg_y to bring it
           // closer to sig. The adjustment is capped by a maximum delta, and
           // chosen such that in most cases the resulting sum_diff will be
-          // within the accceptable range given by sum_diff_thresh.
+          // within the acceptable range given by sum_diff_thresh.
 
           // The delta is set by the excess of absolute pixel diff over the
           // threshold.
@@ -302,12 +302,12 @@
         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
         if (abs_sum_diff > sum_diff_thresh) {
           // Before returning to copy the block (i.e., apply no denoising),
-          // checK if we can still apply some (weaker) temporal filtering to
+          // check if we can still apply some (weaker) temporal filtering to
           // this block, that would otherwise not be denoised at all. Simplest
           // is to apply an additional adjustment to running_avg_y to bring it
           // closer to sig. The adjustment is capped by a maximum delta, and
           // chosen such that in most cases the resulting sum_diff will be
-          // within the accceptable range given by sum_diff_thresh.
+          // within the acceptable range given by sum_diff_thresh.
 
           // The delta is set by the excess of absolute pixel diff over the
           // threshold.
diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libvpx/vp8/encoder/x86/quantize_ssse3.c
index 448217f..14282db 100644
--- a/libvpx/vp8/encoder/x86/quantize_ssse3.c
+++ b/libvpx/vp8/encoder/x86/quantize_ssse3.c
@@ -17,7 +17,7 @@
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 static int bsr(int mask) {
-  int eob;
+  unsigned long eob;
   _BitScanReverse(&eob, mask);
   eob++;
   if (mask == 0)
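
The bsr() fix changes the local from int to unsigned long because that is what the MSVC intrinsic expects. Shown here for reference only (a comment summarizing the documented prototype, not code from this tree):

    /* <intrin.h>, per the MSVC documentation:
     *   unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask);
     * The index of the highest set bit is written through *Index, so the
     * local passed by address must be unsigned long, not int. */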
diff --git a/libvpx/vp8/encoder/x86/subtract_mmx.asm b/libvpx/vp8/encoder/x86/subtract_mmx.asm
deleted file mode 100644
index 794dd22..0000000
--- a/libvpx/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,223 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp8_subtract_b_mmx_impl) PRIVATE
-sym(vp8_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_mmx) PRIVATE
-sym(vp8_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;src
-    movsxd      rdx,        dword ptr arg(2);src_stride
-    mov         rax,        arg(3)          ;pred
-    push        rbx
-    movsxd      rbx,        dword ptr arg(4);pred_stride
-
-    pxor        mm0,        mm0
-    mov         rcx,        16
-
-
-.submby_loop:
-    movq        mm1,        [rsi]
-    movq        mm3,        [rax]
-
-    movq        mm2,        mm1
-    movq        mm4,        mm3
-
-    punpcklbw   mm1,        mm0
-    punpcklbw   mm3,        mm0
-
-    punpckhbw   mm2,        mm0
-    punpckhbw   mm4,        mm0
-
-    psubw       mm1,        mm3
-    psubw       mm2,        mm4
-
-    movq        [rdi],      mm1
-    movq        [rdi+8],    mm2
-
-    movq        mm1,        [rsi+8]
-    movq        mm3,        [rax+8]
-
-    movq        mm2,        mm1
-    movq        mm4,        mm3
-
-    punpcklbw   mm1,        mm0
-    punpcklbw   mm3,        mm0
-
-    punpckhbw   mm2,        mm0
-    punpckhbw   mm4,        mm0
-
-    psubw       mm1,        mm3
-    psubw       mm2,        mm4
-
-    movq        [rdi+16],   mm1
-    movq        [rdi+24],   mm2
-    add         rdi,        32
-    lea         rax,        [rax+rbx]
-    lea         rsi,        [rsi+rdx]
-    dec         rcx
-    jnz         .submby_loop
-
-    pop rbx
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-
-global sym(vp8_subtract_mbuv_mmx) PRIVATE
-sym(vp8_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;usrc
-    movsxd      rdx,        dword ptr arg(3);src_stride;
-    mov         rax,        arg(4)          ;upred
-    add         rdi,        256*2           ;diff = diff + 256 (shorts)
-    mov         rcx,        8
-    push        rbx
-    movsxd      rbx,        dword ptr arg(6);pred_stride
-
-    pxor        mm7,        mm7
-
-.submbu_loop:
-    movq        mm0,        [rsi]
-    movq        mm1,        [rax]
-    movq        mm3,        mm0
-    movq        mm4,        mm1
-    punpcklbw   mm0,        mm7
-    punpcklbw   mm1,        mm7
-    punpckhbw   mm3,        mm7
-    punpckhbw   mm4,        mm7
-    psubw       mm0,        mm1
-    psubw       mm3,        mm4
-    movq        [rdi],      mm0
-    movq        [rdi+8],    mm3
-    add         rdi, 16
-    add         rsi, rdx
-    add         rax, rbx
-
-    dec         rcx
-    jnz         .submbu_loop
-
-    mov         rsi,        arg(2)          ;vsrc
-    mov         rax,        arg(5)          ;vpred
-    mov         rcx,        8
-
-.submbv_loop:
-    movq        mm0,        [rsi]
-    movq        mm1,        [rax]
-    movq        mm3,        mm0
-    movq        mm4,        mm1
-    punpcklbw   mm0,        mm7
-    punpcklbw   mm1,        mm7
-    punpckhbw   mm3,        mm7
-    punpckhbw   mm4,        mm7
-    psubw       mm0,        mm1
-    psubw       mm3,        mm4
-    movq        [rdi],      mm0
-    movq        [rdi+8],    mm3
-    add         rdi, 16
-    add         rsi, rdx
-    add         rax, rbx
-
-    dec         rcx
-    jnz         .submbv_loop
-
-    pop         rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
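
For reference, the deleted vp8_subtract_mby_mmx above computes the per-pixel difference between a 16x16 source macroblock and its prediction, widened to 16-bit shorts. A plain-C sketch of that computation follows; the function name is illustrative only and this is not the code that replaces the assembly in the new tree.

static void subtract_mby_c(short *diff, const unsigned char *src,
                           int src_stride, const unsigned char *pred,
                           int pred_stride) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    /* 16 pixels per row; diff rows are packed back to back (16 shorts). */
    for (c = 0; c < 16; ++c)
      diff[c] = (short)(src[c] - pred[c]);
    diff += 16;
    src += src_stride;
    pred += pred_stride;
  }
}
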
diff --git a/libvpx/vp8/encoder/x86/subtract_sse2.asm b/libvpx/vp8/encoder/x86/subtract_sse2.asm
deleted file mode 100644
index a5d17f5..0000000
--- a/libvpx/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp8_subtract_b_sse2_impl) PRIVATE
-sym(vp8_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_sse2) PRIVATE
-sym(vp8_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;src
-    movsxd      rdx,        dword ptr arg(2);src_stride
-    mov         rax,        arg(3)          ;pred
-    movdqa      xmm4,       [GLOBAL(t80)]
-    push        rbx
-    mov         rcx,        8               ; do two lines at one time
-    movsxd      rbx,        dword ptr arg(4);pred_stride
-
-.submby_loop:
-    movdqa      xmm0,       [rsi]           ; src
-    movdqa      xmm1,       [rax]           ; pred
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm1            ; put sign back to subtraction
-
-    movdqa      xmm3,       [rsi + rdx]
-    movdqa      xmm5,       [rax + rbx]
-
-    lea         rsi,        [rsi+rdx*2]
-    lea         rax,        [rax+rbx*2]
-
-    movdqa      [rdi],      xmm0
-    movdqa      [rdi +16],  xmm2
-
-    movdqa      xmm1,       xmm3
-    psubb       xmm3,       xmm5
-
-    pxor        xmm5,       xmm4            ;convert to signed values
-    pxor        xmm1,       xmm4
-    pcmpgtb     xmm5,       xmm1            ; obtain sign information
-
-    movdqa      xmm1,       xmm3
-    punpcklbw   xmm3,       xmm5            ; put sign back to subtraction
-    punpckhbw   xmm1,       xmm5            ; put sign back to subtraction
-
-    movdqa      [rdi +32],  xmm3
-    movdqa      [rdi +48],  xmm1
-
-    add         rdi,        64
-    dec         rcx
-    jnz         .submby_loop
-
-    pop rbx
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-global sym(vp8_subtract_mbuv_sse2) PRIVATE
-sym(vp8_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-    movdqa      xmm4,       [GLOBAL(t80)]
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;usrc
-    movsxd      rdx,        dword ptr arg(3);src_stride;
-    mov         rax,        arg(4)          ;upred
-    add         rdi,        256*2           ;diff = diff + 256 (shorts)
-    mov         rcx,        4
-    push        rbx
-    movsxd      rbx,        dword ptr arg(6);pred_stride
-
-    ;u
-.submbu_loop:
-    movq        xmm0,       [rsi]           ; src
-    movq        xmm2,       [rsi+rdx]       ; src -- next line
-    movq        xmm1,       [rax]           ; pred
-    movq        xmm3,       [rax+rbx]       ; pred -- next line
-    lea         rsi,        [rsi + rdx*2]
-    lea         rax,        [rax + rbx*2]
-
-    punpcklqdq  xmm0,       xmm2
-    punpcklqdq  xmm1,       xmm3
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1            ; subtraction with sign missed
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    movdqa      xmm3,       xmm1
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
-
-    movdqa      [rdi],      xmm0            ; store difference
-    movdqa      [rdi +16],  xmm2            ; store difference
-    add         rdi,        32
-    sub         rcx, 1
-    jnz         .submbu_loop
-
-    mov         rsi,        arg(2)          ;vsrc
-    mov         rax,        arg(5)          ;vpred
-    mov         rcx,        4
-
-    ;v
-.submbv_loop:
-    movq        xmm0,       [rsi]           ; src
-    movq        xmm2,       [rsi+rdx]       ; src -- next line
-    movq        xmm1,       [rax]           ; pred
-    movq        xmm3,       [rax+rbx]       ; pred -- next line
-    lea         rsi,        [rsi + rdx*2]
-    lea         rax,        [rax + rbx*2]
-
-    punpcklqdq  xmm0,       xmm2
-    punpcklqdq  xmm1,       xmm3
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1            ; subtraction with sign missed
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    movdqa      xmm3,       xmm1
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
-
-    movdqa      [rdi],      xmm0            ; store difference
-    movdqa      [rdi +16],  xmm2            ; store difference
-    add         rdi,        32
-    sub         rcx, 1
-    jnz         .submbv_loop
-
-    pop         rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
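
The deleted SSE2 file above relies on a sign-recovery trick: psubb keeps only the low 8 bits of src - pred, and the true sign is reconstructed by comparing the operands as signed bytes after biasing them with 0x80 (the t80 constant). Below is a hedged intrinsics sketch of the same idea for one 16-byte block; the helper name is illustrative and this is not the upstream replacement code.

#include <emmintrin.h>  /* SSE2 */

static void subtract_16_bytes(short *diff, const unsigned char *src,
                              const unsigned char *pred) {
  const __m128i t80 = _mm_set1_epi8((char)0x80);
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  const __m128i p = _mm_loadu_si128((const __m128i *)pred);
  /* Low 8 bits of src - pred for each byte. */
  const __m128i d = _mm_sub_epi8(s, p);
  /* Bias both operands into signed range, then compare: the mask is 0xFF
   * exactly where pred > src, i.e. where the true difference is negative. */
  const __m128i sign =
      _mm_cmpgt_epi8(_mm_xor_si128(p, t80), _mm_xor_si128(s, t80));
  /* Interleaving with the sign mask sign-extends each byte to 16 bits. */
  _mm_storeu_si128((__m128i *)(diff + 0), _mm_unpacklo_epi8(d, sign));
  _mm_storeu_si128((__m128i *)(diff + 8), _mm_unpackhi_epi8(d, sign));
}
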
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
index cf3d8ca..7bf5155 100644
--- a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -65,14 +65,3 @@
     return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
 }
 
-void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *z = *(be->base_src) + be->src;
-    unsigned int  src_stride = be->src_stride;
-    short *diff = &be->src_diff[0];
-    unsigned char *predictor = &bd->predictor[0];
-    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
index 3dfbee3..be9aaf3 100644
--- a/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -30,14 +30,3 @@
     return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 }
 
-void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *z = *(be->base_src) + be->src;
-    unsigned int  src_stride = be->src_stride;
-    short *diff = &be->src_diff[0];
-    unsigned char *predictor = &bd->predictor[0];
-    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
diff --git a/libvpx/vp8/encoder/x86/quantize_sse2.c b/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
similarity index 97%
rename from libvpx/vp8/encoder/x86/quantize_sse2.c
rename to libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
index 291d219..b4e92e0 100644
--- a/libvpx/vp8/encoder/x86/quantize_sse2.c
+++ b/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
@@ -35,10 +35,10 @@
 void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
     char eob = 0;
-    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *zbin_boost_ptr;
     short *qcoeff_ptr      = d->qcoeff;
-    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
-    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+    DECLARE_ALIGNED(16, short, x[16]);
+    DECLARE_ALIGNED(16, short, y[16]);
 
     __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
     __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
@@ -55,7 +55,7 @@
     __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
     __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
 
-    vpx_memset(qcoeff_ptr, 0, 32);
+    memset(qcoeff_ptr, 0, 32);
 
     /* Duplicate to all lanes. */
     zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
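
Both the old DECLARE_ALIGNED_ARRAY form and the new DECLARE_ALIGNED form exist to guarantee the 16-byte alignment that _mm_load_si128/_mm_store_si128 require for the scratch buffers. On a GCC-style compiler the new form expands roughly as sketched below; this is an approximation, and the real macro in vpx_ports/mem.h also handles other compilers such as MSVC.

/* Approximate expansion, assuming __GNUC__. */
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

/* As used above: a 16-byte aligned array of 16 shorts. */
DECLARE_ALIGNED(16, short, x[16]);
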
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
index 6db031f..3ad11c7 100644
--- a/libvpx/vp8/vp8_common.mk
+++ b/libvpx/vp8/vp8_common.mk
@@ -15,6 +15,7 @@
 VP8_COMMON_SRCS-yes += common/alloccommon.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/copy_c.c
 VP8_COMMON_SRCS-yes += common/debugmodes.c
 VP8_COMMON_SRCS-yes += common/default_coef_probs.h
 VP8_COMMON_SRCS-yes += common/dequantize.c
@@ -52,7 +53,7 @@
 VP8_COMMON_SRCS-yes += common/systemdependent.h
 VP8_COMMON_SRCS-yes += common/threading.h
 VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/loopfilter.c
+VP8_COMMON_SRCS-yes += common/vp8_loopfilter.c
 VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
 VP8_COMMON_SRCS-yes += common/mbpitch.c
 VP8_COMMON_SRCS-yes += common/modecont.c
@@ -60,11 +61,8 @@
 VP8_COMMON_SRCS-yes += common/reconinter.c
 VP8_COMMON_SRCS-yes += common/reconintra.c
 VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/sad_c.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-yes += common/variance_c.c
-VP8_COMMON_SRCS-yes += common/variance.h
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 
@@ -83,28 +81,19 @@
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
 VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
 
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -119,16 +108,29 @@
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idctllm_dspr2.c
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/filter_dspr2.c
-VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/loopfilter_filters_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/reconinter_dspr2.c
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idct_blk_dspr2.c
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/dequantize_dspr2.c
 
 # common (c)
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/bilinear_filter_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/reconintra_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
+
+ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
+endif
+
+# common (c)
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/variance_arm.c
 
 # common (media)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.c
@@ -148,24 +150,6 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-
-# common (neon)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
@@ -173,12 +157,16 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
index d515fc0..fe88cd4 100644
--- a/libvpx/vp8/vp8_cx_iface.c
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -10,10 +10,13 @@
 
 
 #include "./vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
@@ -36,43 +39,33 @@
     vp8e_tuning                 tuning;
     unsigned int                cq_level;         /* constrained quality level */
     unsigned int                rc_max_intra_bitrate_pct;
+    unsigned int                screen_content_mode;
 
 };
 
-struct extraconfig_map
-{
-    int                 usage;
-    struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] =
-{
-    {
-        0,
-        {
-            NULL,
+static struct vp8_extracfg default_extracfg = {
+  NULL,
 #if !(CONFIG_REALTIME_ONLY)
-            0,                          /* cpu_used      */
+  0,                          /* cpu_used      */
 #else
-            4,                          /* cpu_used      */
+  4,                          /* cpu_used      */
 #endif
-            0,                          /* enable_auto_alt_ref */
-            0,                          /* noise_sensitivity */
-            0,                          /* Sharpness */
-            0,                          /* static_thresh */
+  0,                          /* enable_auto_alt_ref */
+  0,                          /* noise_sensitivity */
+  0,                          /* Sharpness */
+  0,                          /* static_thresh */
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
-            VP8_EIGHT_TOKENPARTITION,
+  VP8_EIGHT_TOKENPARTITION,
 #else
-            VP8_ONE_TOKENPARTITION,     /* token_partitions */
+  VP8_ONE_TOKENPARTITION,     /* token_partitions */
 #endif
-            0,                          /* arnr_max_frames */
-            3,                          /* arnr_strength */
-            3,                          /* arnr_type*/
-            0,                          /* tuning*/
-            10,                         /* cq_level */
-            0,                          /* rc_max_intra_bitrate_pct */
-        }
-    }
+  0,                          /* arnr_max_frames */
+  3,                          /* arnr_strength */
+  3,                          /* arnr_type*/
+  0,                          /* tuning*/
+  10,                         /* cq_level */
+  0,                          /* rc_max_intra_bitrate_pct */
+  0,                          /* screen_content_mode */
 };
 
 struct vpx_codec_alg_priv
@@ -90,6 +83,7 @@
     /* pkt_list size depends on the maximum number of lagged frames allowed. */
     vpx_codec_pkt_list_decl(64) pkt_list;
     unsigned int                fixed_kf_cntr;
+    vpx_enc_frame_flags_t   control_frame_flags;
 };
 
 
@@ -141,7 +135,7 @@
     RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
     RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
     RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-    RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+    RANGE_CHECK(cfg, g_timebase.num,        1, 1000000000);
     RANGE_CHECK_HI(cfg, g_profile,          3);
     RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
     RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
@@ -205,6 +199,7 @@
     RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
     RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
     RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+    RANGE_CHECK_HI(vp8_cfg, screen_content_mode, 2);
     if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
         RANGE_CHECK(vp8_cfg, cq_level,
                     cfg->rc_min_quantizer, cfg->rc_max_quantizer);
@@ -242,7 +237,8 @@
         RANGE_CHECK_HI(cfg, ts_periodicity, 16);
 
         for (i=1; i<cfg->ts_number_layers; i++)
-            if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+            if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] &&
+                cfg->rc_target_bitrate > 0)
                 ERROR("ts_target_bitrate entries are not strictly increasing");
 
         RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
@@ -371,9 +367,9 @@
     if (oxcf->number_of_layers > 1)
     {
         memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
-                          sizeof(cfg.ts_target_bitrate));
+                sizeof(cfg.ts_target_bitrate));
         memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
-                          sizeof(cfg.ts_rate_decimator));
+                sizeof(cfg.ts_rate_decimator));
         memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
     }
 
@@ -390,6 +386,8 @@
         oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
         oxcf->mr_low_res_mode_info        = mr_cfg->mr_low_res_mode_info;
     }
+#else
+    (void)mr_cfg;
 #endif
 
     oxcf->cpu_used               = vp8_cfg.cpu_used;
@@ -408,6 +406,8 @@
 
     oxcf->tuning                 = vp8_cfg.tuning;
 
+    oxcf->screen_content_mode    = vp8_cfg.screen_content_mode;
+
     /*
         printf("Current VP8 Settings: \n");
         printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -449,9 +449,14 @@
 {
     vpx_codec_err_t res;
 
-    if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
-        && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
-        ERROR("Cannot change width or height after initialization");
+    if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+    {
+        if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+            ERROR("Cannot change width or height after initialization");
+        if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+            (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+            ERROR("Cannot increase width or height larger than their initial values");
+    }
 
     /* Prevent increasing lag_in_frames. This check is stricter than it needs
      * to be -- the limit is not increasing past the first lag_in_frames
@@ -473,8 +478,6 @@
     return res;
 }
 
-int vp8_reverse_trans(int);
-
 static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args)
 {
   int *const arg = va_arg(args, int *);
@@ -597,6 +600,15 @@
   return update_extracfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
+                                               va_list args)
+{
+  struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
+  extra_cfg.screen_content_mode =
+      CAST(VP8E_SET_SCREEN_CONTENT_MODE, args);
+  return update_extracfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
                                         void **mem_loc)
 {
@@ -623,6 +635,9 @@
         *mem_loc = (void *)shared_mem_loc;
         res = VPX_CODEC_OK;
     }
+#else
+    (void)cfg;
+    (void)mem_loc;
 #endif
     return res;
 }
@@ -631,27 +646,23 @@
                                  vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
 {
     vpx_codec_err_t        res = VPX_CODEC_OK;
-    struct vpx_codec_alg_priv *priv;
-    vpx_codec_enc_cfg_t       *cfg;
-    unsigned int               i;
 
-    struct VP8_COMP *optr;
 
     vp8_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
 
     if (!ctx->priv)
     {
-        priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+        struct vpx_codec_alg_priv *priv =
+            (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv));
 
         if (!priv)
         {
             return VPX_CODEC_MEM_ERROR;
         }
 
-        ctx->priv = &priv->base;
-        ctx->priv->sz = sizeof(*ctx->priv);
-        ctx->priv->iface = ctx->iface;
-        ctx->priv->alg_priv = priv;
+        ctx->priv = (vpx_codec_priv_t *)priv;
         ctx->priv->init_flags = ctx->init_flags;
 
         if (ctx->config.enc)
@@ -659,21 +670,11 @@
             /* Update the reference to the config structure to an
              * internal copy.
              */
-            ctx->priv->alg_priv->cfg = *ctx->config.enc;
-            ctx->config.enc = &ctx->priv->alg_priv->cfg;
+            priv->cfg = *ctx->config.enc;
+            ctx->config.enc = &priv->cfg;
         }
 
-        cfg =  &ctx->priv->alg_priv->cfg;
-
-        /* Select the extra vp8 configuration table based on the current
-         * usage value. If the current usage value isn't found, use the
-         * values for usage case 0.
-         */
-        for (i = 0;
-             extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-             i++);
-
-        priv->vp8_cfg = extracfg_map[i].cfg;
+        priv->vp8_cfg = default_extracfg;
         priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
 
         priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
@@ -696,17 +697,10 @@
 
         if (!res)
         {
-            set_vp8e_config(&ctx->priv->alg_priv->oxcf,
-                             ctx->priv->alg_priv->cfg,
-                             ctx->priv->alg_priv->vp8_cfg,
-                             mr_cfg);
-
-            optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
-
-            if (!optr)
+            set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
+            priv->cpi = vp8_create_compressor(&priv->oxcf);
+            if (!priv->cpi)
                 res = VPX_CODEC_MEM_ERROR;
-            else
-                ctx->priv->alg_priv->cpi = optr;
         }
     }
 
@@ -727,24 +721,30 @@
 
     free(ctx->cx_data);
     vp8_remove_compressor(&ctx->cpi);
-    free(ctx);
+    vpx_free(ctx);
     return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
                                        YV12_BUFFER_CONFIG  *yv12)
 {
+    const int y_w = img->d_w;
+    const int y_h = img->d_h;
+    const int uv_w = (img->d_w + 1) / 2;
+    const int uv_h = (img->d_h + 1) / 2;
     vpx_codec_err_t        res = VPX_CODEC_OK;
     yv12->y_buffer = img->planes[VPX_PLANE_Y];
     yv12->u_buffer = img->planes[VPX_PLANE_U];
     yv12->v_buffer = img->planes[VPX_PLANE_V];
 
-    yv12->y_crop_width  = img->d_w;
-    yv12->y_crop_height = img->d_h;
-    yv12->y_width  = img->d_w;
-    yv12->y_height = img->d_h;
-    yv12->uv_width = (1 + yv12->y_width) / 2;
-    yv12->uv_height = (1 + yv12->y_height) / 2;
+    yv12->y_crop_width  = y_w;
+    yv12->y_crop_height = y_h;
+    yv12->y_width  = y_w;
+    yv12->y_height = y_h;
+    yv12->uv_crop_width = uv_w;
+    yv12->uv_crop_height = uv_h;
+    yv12->uv_width = uv_w;
+    yv12->uv_height = uv_h;
 
     yv12->y_stride = img->stride[VPX_PLANE_Y];
     yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -796,27 +796,9 @@
     }
 }
 
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
-                                   const vpx_image_t     *img,
-                                   vpx_codec_pts_t        pts,
-                                   unsigned long          duration,
-                                   vpx_enc_frame_flags_t  flags,
-                                   unsigned long          deadline)
+static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
+                                                int flags)
 {
-    vpx_codec_err_t res = VPX_CODEC_OK;
-
-    if (!ctx->cfg.rc_target_bitrate)
-        return res;
-
-    if (img)
-        res = validate_img(ctx, img);
-
-    if (!res)
-        res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
-
-    pick_quickcompress_mode(ctx, duration, deadline);
-    vpx_codec_pkt_list_init(&ctx->pkt_list);
 
     /* Handle Flags */
     if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
@@ -866,6 +848,39 @@
         vp8_update_entropy(ctx->cpi, 0);
     }
 
+    return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline)
+{
+    vpx_codec_err_t res = VPX_CODEC_OK;
+
+    if (!ctx->cfg.rc_target_bitrate)
+        return res;
+
+    if (img)
+        res = validate_img(ctx, img);
+
+    if (!res)
+        res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
+    pick_quickcompress_mode(ctx, duration, deadline);
+    vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+    // If no flags are set in the encode call, then use the frame flags as
+    // defined via the control function: vp8e_set_frame_flags.
+    if (!flags) {
+        flags = ctx->control_frame_flags;
+    }
+    ctx->control_frame_flags = 0;
+
+    res = set_reference_and_update(ctx, flags);
+
     /* Handle fixed keyframe intervals */
     if (ctx->cfg.kf_mode == VPX_KF_AUTO
         && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
@@ -1168,6 +1183,25 @@
     return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
+                                            va_list args)
+{
+    int frame_flags = va_arg(args, int);
+    ctx->control_frame_flags = frame_flags;
+    return set_reference_and_update(ctx, frame_flags);
+}
+
+static vpx_codec_err_t vp8e_set_temporal_layer_id(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args)
+{
+    int layer_id = va_arg(args, int);
+    if (layer_id < 0 || layer_id >= (int)ctx->cfg.ts_number_layers) {
+      return VPX_CODEC_INVALID_PARAM;
+    }
+    ctx->cpi->temporal_layer_id = layer_id;
+    return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         va_list args)
 {
@@ -1242,6 +1276,8 @@
     {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
     {VP8E_UPD_REFERENCE,                vp8e_update_reference},
     {VP8E_USE_REFERENCE,                vp8e_use_reference},
+    {VP8E_SET_FRAME_FLAGS,              vp8e_set_frame_flags},
+    {VP8E_SET_TEMPORAL_LAYER_ID,        vp8e_set_temporal_layer_id},
     {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
     {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
     {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
@@ -1259,6 +1295,7 @@
     {VP8E_SET_TUNING,                   set_tuning},
     {VP8E_SET_CQ_LEVEL,                 set_cq_level},
     {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_rc_max_intra_bitrate_pct},
+    {VP8E_SET_SCREEN_CONTENT_MODE,      set_screen_content_mode},
     { -1, NULL},
 };
 
@@ -1273,6 +1310,9 @@
 
         320,                /* g_width */
         240,                /* g_height */
+        VPX_BITS_8,         /* g_bit_depth */
+        8,                  /* g_input_bit_depth */
+
         {1, 30},            /* g_timebase */
 
         0,                  /* g_error_resilient */
@@ -1289,10 +1329,8 @@
         30,                 /* rc_resize_up_thresold */
 
         VPX_VBR,            /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
         {0},                /* rc_twopass_stats_in */
         {0},                /* rc_firstpass_mb_stats_in */
-#endif
         256,                /* rc_target_bandwidth */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
@@ -1312,9 +1350,6 @@
         0,                  /* kf_min_dist */
         128,                /* kf_max_dist */
 
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
-        "vp8.fpf"           /* first pass filename */
-#endif
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
         {0},
         {0},                /* ss_target_bitrate */
@@ -1341,18 +1376,19 @@
     vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
     vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
     {
-        NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-        NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,    /* vpx_codec_peek_si_fn_t    peek_si; */
+        NULL,    /* vpx_codec_get_si_fn_t     get_si; */
+        NULL,    /* vpx_codec_decode_fn_t     decode; */
+        NULL,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,    /* vpx_codec_set_fb_fn_t     set_fb_fn; */
     },
     {
         1,                  /* 1 cfg map */
-        vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
+        vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
         vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
-        vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
+        vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   get_cx_data; */
         vp8e_set_config,
-        NOT_IMPLEMENTED,
+        NULL,
         vp8e_get_preview,
         vp8e_mr_alloc_mem,
     } /* encoder functions */
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index 9a0cdb7..72e4770 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -11,7 +11,9 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
@@ -60,7 +62,6 @@
     vpx_decrypt_cb          decrypt_cb;
     void                    *decrypt_state;
     vpx_image_t             img;
-    int                     flushed;
     int                     img_setup;
     struct frame_buffers    yv12_frame_buffers;
     void                    *user_priv;
@@ -75,73 +76,72 @@
      * known)
      */
     (void)si;
+    (void)flags;
     return sizeof(vpx_codec_alg_priv_t);
 }
 
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
 {
-    ctx->priv =
-        (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
-    vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
-    ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
-    ctx->priv->alg_priv->decrypt_cb = NULL;
-    ctx->priv->alg_priv->decrypt_state = NULL;
-    ctx->priv->alg_priv->flushed = 0;
+    vpx_codec_alg_priv_t *priv =
+        (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
 
+    priv->si.sz = sizeof(priv->si);
+    priv->decrypt_cb = NULL;
+    priv->decrypt_state = NULL;
+
     if (ctx->config.dec)
     {
         /* Update the reference to the config structure to an internal copy. */
-        ctx->priv->alg_priv->cfg = *ctx->config.dec;
-        ctx->config.dec = &ctx->priv->alg_priv->cfg;
+        priv->cfg = *ctx->config.dec;
+        ctx->config.dec = &priv->cfg;
     }
 }
 
 static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
                                 vpx_codec_priv_enc_mr_cfg_t *data)
 {
-    vpx_codec_err_t        res = VPX_CODEC_OK;
+    vpx_codec_err_t res = VPX_CODEC_OK;
+    vpx_codec_alg_priv_t *priv = NULL;
     (void) data;
 
     vp8_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
 
     /* This function only allocates space for the vpx_codec_alg_priv_t
      * structure. More memory may be required at the time the stream
      * information becomes known.
      */
-    if (!ctx->priv)
-    {
-        vp8_init_ctx(ctx);
+    if (!ctx->priv) {
+      vp8_init_ctx(ctx);
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
 
-        /* initialize number of fragments to zero */
-        ctx->priv->alg_priv->fragments.count = 0;
-        /* is input fragments enabled? */
-        ctx->priv->alg_priv->fragments.enabled =
-                (ctx->priv->alg_priv->base.init_flags &
-                    VPX_CODEC_USE_INPUT_FRAGMENTS);
+      /* initialize number of fragments to zero */
+      priv->fragments.count = 0;
+      /* is input fragments enabled? */
+      priv->fragments.enabled =
+          (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
 
-        /*post processing level initialized to do nothing */
+      /*post processing level initialized to do nothing */
+    } else {
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
     }
 
-    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
-            (ctx->priv->alg_priv->base.init_flags &
-                    VPX_CODEC_USE_FRAME_THREADING);
+    priv->yv12_frame_buffers.use_frame_threads =
+        (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
 
     /* for now, disable frame threading */
-    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0;
+    priv->yv12_frame_buffers.use_frame_threads = 0;
 
-    if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads &&
-            (( ctx->priv->alg_priv->base.init_flags &
-                            VPX_CODEC_USE_ERROR_CONCEALMENT)
-                    || ( ctx->priv->alg_priv->base.init_flags &
-                            VPX_CODEC_USE_INPUT_FRAGMENTS) ) )
-    {
-        /* row-based threading, error concealment, and input fragments will
-         * not be supported when using frame-based threading */
-        res = VPX_CODEC_INVALID_PARAM;
+    if (priv->yv12_frame_buffers.use_frame_threads &&
+        ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) {
+      /* row-based threading, error concealment, and input fragments will
+       * not be supported when using frame-based threading */
+      res = VPX_CODEC_INVALID_PARAM;
     }
 
     return res;
@@ -192,7 +192,7 @@
 
             /* vet via sync code */
             if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
-                res = VPX_CODEC_UNSUP_BITSTREAM;
+                return VPX_CODEC_UNSUP_BITSTREAM;
 
             si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
             si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -290,8 +290,8 @@
     if (ctx->fragments.count == 0)
     {
         /* New frame, reset fragment pointers and sizes */
-        vpx_memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
-        vpx_memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+        memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+        memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
     }
     if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
     {
@@ -310,6 +310,11 @@
         return 0;
     }
 
+    if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+    {
+        return 0;
+    }
+
     if (!ctx->fragments.enabled)
     {
         ctx->fragments.ptrs[0] = data;
@@ -330,14 +335,11 @@
     unsigned int resolution_change = 0;
     unsigned int w, h;
 
-    if (data == NULL && data_sz == 0) {
-      ctx->flushed = 1;
-      return VPX_CODEC_OK;
+    if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+    {
+        return 0;
     }
 
-    /* Reset flushed when receiving a valid frame */
-    ctx->flushed = 0;
-
     /* Update the input fragment data */
     if(update_fragments(ctx, data, data_sz, &res) <= 0)
         return res;
@@ -404,7 +406,7 @@
     if (!res)
     {
         VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
-        if(resolution_change)
+        if (resolution_change)
         {
             VP8_COMMON *const pc = & pbi->common;
             MACROBLOCKD *const xd  = & pbi->mb;
@@ -566,17 +568,23 @@
 static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
                                        YV12_BUFFER_CONFIG  *yv12)
 {
+    const int y_w = img->d_w;
+    const int y_h = img->d_h;
+    const int uv_w = (img->d_w + 1) / 2;
+    const int uv_h = (img->d_h + 1) / 2;
     vpx_codec_err_t        res = VPX_CODEC_OK;
     yv12->y_buffer = img->planes[VPX_PLANE_Y];
     yv12->u_buffer = img->planes[VPX_PLANE_U];
     yv12->v_buffer = img->planes[VPX_PLANE_V];
 
-    yv12->y_crop_width  = img->d_w;
-    yv12->y_crop_height = img->d_h;
-    yv12->y_width  = img->d_w;
-    yv12->y_height = img->d_h;
-    yv12->uv_width = yv12->y_width / 2;
-    yv12->uv_height = yv12->y_height / 2;
+    yv12->y_crop_width  = y_w;
+    yv12->y_crop_height = y_h;
+    yv12->y_width  = y_w;
+    yv12->y_height = y_h;
+    yv12->uv_crop_width = uv_w;
+    yv12->uv_crop_height = uv_h;
+    yv12->uv_width = uv_w;
+    yv12->uv_height = uv_h;
 
     yv12->y_stride = img->stride[VPX_PLANE_Y];
     yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -644,6 +652,8 @@
         return VPX_CODEC_INVALID_PARAM;
 
 #else
+    (void)ctx;
+    (void)args;
     return VPX_CODEC_INCAPABLE;
 #endif
 }
@@ -809,15 +819,15 @@
         vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
         vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
         vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
-        NOT_IMPLEMENTED,
+        NULL,
     },
     { /* encoder functions */
         0,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL
     }
 };
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
index a0dbdcf..857a631 100644
--- a/libvpx/vp8/vp8cx.mk
+++ b/libvpx/vp8/vp8cx.mk
@@ -60,12 +60,11 @@
 VP8_CX_SRCS-yes += encoder/onyx_if.c
 VP8_CX_SRCS-yes += encoder/pickinter.c
 VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/quantize.c
+VP8_CX_SRCS-yes += encoder/vp8_quantize.c
 VP8_CX_SRCS-yes += encoder/ratectrl.c
 VP8_CX_SRCS-yes += encoder/rdopt.c
 VP8_CX_SRCS-yes += encoder/segmentation.c
 VP8_CX_SRCS-yes += encoder/segmentation.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
 VP8_CX_SRCS-yes += encoder/tokenize.c
 VP8_CX_SRCS-yes += encoder/dct_value_cost.h
 VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
@@ -75,7 +74,6 @@
 VP8_CX_SRCS-yes += encoder/temporal_filter.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
-VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@@ -83,11 +81,10 @@
 endif
 
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 
@@ -95,18 +92,26 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 endif
 
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 endif
 
-VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 
-$(eval $(call asm_offsets_template,\
-         vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c))
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
+endif
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+endif
+
+VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libvpx/vp8/vp8cx_arm.mk b/libvpx/vp8/vp8cx_arm.mk
index 5733048..838b53d 100644
--- a/libvpx/vp8/vp8cx_arm.mk
+++ b/libvpx/vp8/vp8cx_arm.mk
@@ -14,33 +14,15 @@
 #File list for arm
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
-
-#File list for edsp
-# encoder
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_EDSP)  += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_EDSP)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
 
 #File list for media
 # encoder
-VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
 # encoder
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
-
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
deleted file mode 100644
index 60a0d98..0000000
--- a/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-;                            uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0  int input_dc
-; r1  uint8_t *pred_ptr
-; r2  uint8_t *dst_ptr
-; r3  int pitch
-; sp  int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    mul              r0, r0, r12               ; input_dc * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.16         q0, r0;                   ; duplicate a1
-    ldr              r12, [sp]                 ; load stride
-
-    vld1.32         {d2[0]}, [r1], r3
-    vld1.32         {d2[1]}, [r1], r3
-    vld1.32         {d4[0]}, [r1], r3
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2                ; a1 + pred_ptr[c]
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1                    ; clip_pixel
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r2], r12
-    vst1.32         {d2[1]}, [r2], r12
-    vst1.32         {d4[0]}, [r2], r12
-    vst1.32         {d4[1]}, [r2]
-
-    bx               lr
-    ENDP             ; |vp9_dc_only_idct_add_neon|
-
-    END
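
The comments in the deleted file spell out the DC-only path: dct_const_round_shift(input_dc * cospi_16_64) adds (1 << 13) and shifts right by 14 (DCT_CONST_BITS), with cospi_16_64 = 11585, once per transform pass, followed by ROUND_POWER_OF_TWO(out, 4) and a clipped add into the 4x4 destination block. A scalar C sketch of that arithmetic is below; it is illustrative only, not the upstream C fallback.

static void dc_only_idct_add(int input_dc, const unsigned char *pred_ptr,
                             unsigned char *dst_ptr, int pitch, int stride) {
  int r, c, out, a1;
  /* dct_const_round_shift(input_dc * cospi_16_64): row pass, then column pass. */
  out = (input_dc * 11585 + (1 << 13)) >> 14;
  out = (out * 11585 + (1 << 13)) >> 14;
  /* ROUND_POWER_OF_TWO(out, 4) */
  a1 = (out + 8) >> 4;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = a1 + pred_ptr[c];
      dst_ptr[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clip_pixel */
    }
    pred_ptr += pitch;
    dst_ptr += stride;
  }
}
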
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e2..0000000
--- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
-    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
-    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
-    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
-    ; them as buffer during calculation.
-    MACRO
-    IDCT4x4_1D
-    ; stage 1
-    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
-
-    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
-    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
-    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
-    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q10, #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16    q8,  q13, q14
-    vsub.s16    q9,  q13, q14
-    vswp        d18, d19
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
-    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
-    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
-    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
-    ; q14,q15 registers and use them as buffer during calculation.
-    MACRO
-    IADST4x4_1D
-    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
-    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
-    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
-    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
-    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
-    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
-    vaddw.s16   q15, q15, d19   ; x0 + x3
-    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
-    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
-    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
-
-    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
-    vadd.s32    q10, q10, q8
-    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
-    vdup.32     q8, r0          ; duplicate sinpi_3_9
-    vsub.s32    q11, q11, q9
-    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
-
-    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
-    vadd.s32    q10, q10, q11   ; x0 + x1
-    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
-    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d16, q13, #14
-    vqrshrn.s32 d17, q14, #14
-    vqrshrn.s32 d18, q15, #14
-    vqrshrn.s32 d19, q10, #14
-    MEND
-
-    ; Generate cosine constants in d6 - d8 for the IDCT
-    MACRO
-    GENERATE_COSINE_CONSTANTS
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov         r0, #0x3b00
-    add         r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov         r3, #0x2d00
-    add         r3, #0x41
-    ; cospi_24_64 = 6270 = 0x187e
-    mov         r12, #0x1800
-    add         r12, #0x7e
-
-    ; generate constant vectors
-    vdup.16     d0, r0          ; duplicate cospi_8_64
-    vdup.16     d1, r3          ; duplicate cospi_16_64
-    vdup.16     d2, r12         ; duplicate cospi_24_64
-    MEND
-
-    ; Generate sine constants in d1 - d4 for the IADST.
-    MACRO
-    GENERATE_SINE_CONSTANTS
-    ; sinpi_1_9 = 5283 = 0x14A3
-    mov         r0, #0x1400
-    add         r0, #0xa3
-    ; sinpi_2_9 = 9929 = 0x26C9
-    mov         r3, #0x2600
-    add         r3, #0xc9
-    ; sinpi_4_9 = 15212 = 0x3B6C
-    mov         r12, #0x3b00
-    add         r12, #0x6c
-
-    ; generate constant vectors
-    vdup.16     d3, r0          ; duplicate sinpi_1_9
-
-    ; sinpi_3_9 = 13377 = 0x3441
-    mov         r0, #0x3400
-    add         r0, #0x41
-
-    vdup.16     d4, r3          ; duplicate sinpi_2_9
-    vdup.16     d5, r12         ; duplicate sinpi_4_9
-    vdup.16     q3, r0          ; duplicate sinpi_3_9
-    MEND
-
-    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
-    MACRO
-    TRANSPOSE4X4
-    vtrn.16     d16, d17
-    vtrn.16     d18, d19
-    vtrn.32     q8, q9
-    MEND
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16    {q8,q9}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE4X4
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IDCT4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IDCT4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
-    ; generate constants
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16   q8, q8, #4
-    vrshr.s16   q9, q9, #4
-
-    vld1.32     {d26[0]}, [r1], r2
-    vld1.32     {d26[1]}, [r1], r2
-    vld1.32     {d27[0]}, [r1], r2
-    vld1.32     {d27[1]}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8    q8, q8, d26
-    vaddw.u8    q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb         r2, r2, #0
-    vst1.32     {d27[1]}, [r1], r2
-    vst1.32     {d27[0]}, [r1], r2
-    vst1.32     {d26[1]}, [r1], r2
-    vst1.32     {d26[0]}, [r1]  ; no post-increment
-    bx          lr
-    ENDP  ; |vp9_iht4x4_16_add_neon|
-
-    END
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 0000000..1761fad
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
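+// Transform constants in 14-bit fixed point: cospi_n_64 corresponds to
+// round(cos(n * pi / 64) * 2^14), and the sinpi_n_9 values are the
+// correspondingly scaled sine terms used by the 4-point ADST. The extra 14
+// bits are dropped again by the vqrshrn_n_s32(..., 14) rounding shifts below.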
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
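+// Transpose a 4x4 block of 16-bit coefficients held as two rows of four in
+// each of *q8s16 and *q9s16.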
+static INLINE void TRANSPOSE4X4(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int32x4_t q8s32, q9s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+
+    d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+    d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+    q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+    q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+    q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+    *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+    *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+    return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16) {
+    *d0s16 = vdup_n_s16(cospi_8_64);
+    *d1s16 = vdup_n_s16(cospi_16_64);
+    *d2s16 = vdup_n_s16(cospi_24_64);
+    return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16) {
+    *d3s16 = vdup_n_s16(sinpi_1_9);
+    *d4s16 = vdup_n_s16(sinpi_2_9);
+    *q3s16 = vdupq_n_s16(sinpi_3_9);
+    *d5s16 = vdup_n_s16(sinpi_4_9);
+    return;
+}
+
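+// One pass of the 4x4 IDCT applied to all four columns in parallel. The
+// cosine constants arrive in d0/d1/d2; the data in *q8s16/*q9s16 is
+// overwritten with the result.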
+static INLINE void IDCT4x4_1D(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    int32x4_t q10s32, q13s32, q14s32, q15s32;
+    int16x8_t q13s16, q14s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, *d2s16);
+    q10s32 = vmull_s16(d17s16, *d0s16);
+    q13s32 = vmull_s16(d23s16, *d1s16);
+    q14s32 = vmull_s16(d24s16, *d1s16);
+    q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+    q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+    *q8s16 = vaddq_s16(q13s16, q14s16);
+    *q9s16 = vsubq_s16(q13s16, q14s16);
+    *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+                          vget_low_s16(*q9s16));  // vswp
+    return;
+}
+
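+// One pass of the 4x4 IADST applied to all four columns in parallel. The
+// sine constants arrive in d3/d4/d5/q3; the data in *q8s16/*q9s16 is
+// overwritten with the result.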
+static INLINE void IADST4x4_1D(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d6s16 = vget_low_s16(*q3s16);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    q10s32 = vmull_s16(*d3s16, d16s16);
+    q11s32 = vmull_s16(*d4s16, d16s16);
+    q12s32 = vmull_s16(d6s16, d17s16);
+    q13s32 = vmull_s16(*d5s16, d18s16);
+    q14s32 = vmull_s16(*d3s16, d18s16);
+    q15s32 = vmovl_s16(d16s16);
+    q15s32 = vaddw_s16(q15s32, d19s16);
+    q8s32  = vmull_s16(*d4s16, d19s16);
+    q15s32 = vsubw_s16(q15s32, d18s16);
+    q9s32  = vmull_s16(*d5s16, d19s16);
+
+    q10s32 = vaddq_s32(q10s32, q13s32);
+    q10s32 = vaddq_s32(q10s32, q8s32);
+    q11s32 = vsubq_s32(q11s32, q14s32);
+    q8s32  = vdupq_n_s32(sinpi_3_9);
+    q11s32 = vsubq_s32(q11s32, q9s32);
+    q15s32 = vmulq_s32(q15s32, q8s32);
+
+    q13s32 = vaddq_s32(q10s32, q12s32);
+    q10s32 = vaddq_s32(q10s32, q11s32);
+    q14s32 = vaddq_s32(q11s32, q12s32);
+    q10s32 = vsubq_s32(q10s32, q12s32);
+
+    d16s16 = vqrshrn_n_s32(q13s32, 14);
+    d17s16 = vqrshrn_n_s32(q14s32, 14);
+    d18s16 = vqrshrn_n_s32(q15s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+    *q8s16 = vcombine_s16(d16s16, d17s16);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    return;
+}
+
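+// Inverse 4x4 hybrid transform (row/column DCT or ADST selected by tx_type),
+// followed by rounding and accumulation into dest. tx_type 0 (idct_idct)
+// falls back to the C implementation; types 1-3 are handled here.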
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    uint8x8_t d26u8, d27u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+    uint32x2_t d26u32, d27u32;
+    int16x8_t q3s16, q8s16, q9s16;
+    uint16x8_t q8u16, q9u16;
+
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    TRANSPOSE4X4(&q8s16, &q9s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C.
+        vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      case 2:  // idct_iadst
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate constants
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      default:  // unreachable: tx_type 0-3 are handled above
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+    dest += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+    return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f566..0000000
--- a/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht8x8_64_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Generate IADST constants in r0 - r12 for the IADST.
-    MACRO
-    GENERATE_IADST_CONSTANTS
-    ; generate  cospi_2_64  = 16305
-    mov             r0, #0x3f00
-    add             r0, #0xb1
-
-    ; generate cospi_30_64 = 1606
-    mov             r1, #0x600
-    add             r1, #0x46
-
-    ; generate cospi_10_64 = 14449
-    mov             r2, #0x3800
-    add             r2, #0x71
-
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_18_64 = 10394
-    mov             r4, #0x2800
-    add             r4, #0x9a
-
-    ; generate cospi_14_64 = 12665
-    mov             r5, #0x3100
-    add             r5, #0x79
-
-    ; generate cospi_26_64 = 4756
-    mov             r6, #0x1200
-    add             r6, #0x94
-
-    ; generate cospi_6_64  = 15679
-    mov             r7, #0x3d00
-    add             r7, #0x3f
-
-    ; generate cospi_8_64  = 15137
-    mov             r8, #0x3b00
-    add             r8, #0x21
-
-    ; generate cospi_24_64 = 6270
-    mov             r9, #0x1800
-    add             r9, #0x7e
-
-    ; generate 0
-    mov             r10, #0
-
-    ; generate  cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-    MEND
-
-    ; Generate IDCT constants in r3 - r9 for the IDCT.
-    MACRO
-    GENERATE_IDCT_CONSTANTS
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-    MEND
-
-    ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
-    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
-    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
-    ; registers and use them as buffer during calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14             ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14             ; >> 14
-    vqrshrn.s32     d23, q15, #14             ; >> 14
-
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14             ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
-    ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
-    ; output will be stored back into q8-q15 registers. This macro will touch
-    ; q0 - q7 registers and use them as buffer during calculation.
-    MACRO
-    IADST8X8_1D
-    vdup.16         d14, r0                   ; duplicate cospi_2_64
-    vdup.16         d15, r1                   ; duplicate cospi_30_64
-
-    ; cospi_2_64  * x0
-    vmull.s16       q1, d30, d14
-    vmull.s16       q2, d31, d14
-
-    ; cospi_30_64 * x0
-    vmull.s16       q3, d30, d15
-    vmull.s16       q4, d31, d15
-
-    vdup.16         d30, r4                   ; duplicate cospi_18_64
-    vdup.16         d31, r5                   ; duplicate cospi_14_64
-
-    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-    vmlal.s16       q1, d16, d15
-    vmlal.s16       q2, d17, d15
-
-    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
-    vmlsl.s16       q3, d16, d14
-    vmlsl.s16       q4, d17, d14
-
-    ; cospi_18_64 * x4
-    vmull.s16       q5, d22, d30
-    vmull.s16       q6, d23, d30
-
-    ; cospi_14_64 * x4
-    vmull.s16       q7, d22, d31
-    vmull.s16       q8, d23, d31
-
-    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-    vmlal.s16       q5, d24, d31
-    vmlal.s16       q6, d25, d31
-
-    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
-    vmlsl.s16       q7, d24, d30
-    vmlsl.s16       q8, d25, d30
-
-    ; (s0 + s4)
-    vadd.s32        q11, q1, q5
-    vadd.s32        q12, q2, q6
-
-    vdup.16         d0, r2                   ; duplicate cospi_10_64
-    vdup.16         d1, r3                   ; duplicate cospi_22_64
-
-    ; (s0 - s4)
-    vsub.s32        q1, q1, q5
-    vsub.s32        q2, q2, q6
-
-    ; x0 = dct_const_round_shift(s0 + s4);
-    vqrshrn.s32     d22, q11, #14             ; >> 14
-    vqrshrn.s32     d23, q12, #14             ; >> 14
-
-    ; (s1 + s5)
-    vadd.s32        q12, q3, q7
-    vadd.s32        q15, q4, q8
-
-    ; (s1 - s5)
-    vsub.s32        q3, q3, q7
-    vsub.s32        q4, q4, q8
-
-    ; x4 = dct_const_round_shift(s0 - s4);
-    vqrshrn.s32     d2, q1, #14               ; >> 14
-    vqrshrn.s32     d3, q2, #14               ; >> 14
-
-    ; x1 = dct_const_round_shift(s1 + s5);
-    vqrshrn.s32     d24, q12, #14             ; >> 14
-    vqrshrn.s32     d25, q15, #14             ; >> 14
-
-    ; x5 = dct_const_round_shift(s1 - s5);
-    vqrshrn.s32     d6, q3, #14               ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; cospi_10_64 * x2
-    vmull.s16       q4, d26, d0
-    vmull.s16       q5, d27, d0
-
-    ; cospi_22_64 * x2
-    vmull.s16       q2, d26, d1
-    vmull.s16       q6, d27, d1
-
-    vdup.16         d30, r6                   ; duplicate cospi_26_64
-    vdup.16         d31, r7                   ; duplicate cospi_6_64
-
-    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-    vmlal.s16       q4, d20, d1
-    vmlal.s16       q5, d21, d1
-
-    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-    vmlsl.s16       q2, d20, d0
-    vmlsl.s16       q6, d21, d0
-
-    ; cospi_26_64 * x6
-    vmull.s16       q0, d18, d30
-    vmull.s16       q13, d19, d30
-
-    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-    vmlal.s16       q0, d28, d31
-    vmlal.s16       q13, d29, d31
-
-    ; cospi_6_64  * x6
-    vmull.s16       q10, d18, d31
-    vmull.s16       q9, d19, d31
-
-    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-    vmlsl.s16       q10, d28, d30
-    vmlsl.s16       q9, d29, d30
-
-    ; (s3 + s7)
-    vadd.s32        q14, q2, q10
-    vadd.s32        q15, q6, q9
-
-    ; (s3 - s7)
-    vsub.s32        q2, q2, q10
-    vsub.s32        q6, q6, q9
-
-    ; x3 = dct_const_round_shift(s3 + s7);
-    vqrshrn.s32     d28, q14, #14             ; >> 14
-    vqrshrn.s32     d29, q15, #14             ; >> 14
-
-    ; x7 = dct_const_round_shift(s3 - s7);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; (s2 + s6)
-    vadd.s32        q9, q4, q0
-    vadd.s32        q10, q5, q13
-
-    ; (s2 - s6)
-    vsub.s32        q4, q4, q0
-    vsub.s32        q5, q5, q13
-
-    vdup.16         d30, r8                   ; duplicate cospi_8_64
-    vdup.16         d31, r9                   ; duplicate cospi_24_64
-
-    ; x2 = dct_const_round_shift(s2 + s6);
-    vqrshrn.s32     d18, q9, #14              ; >> 14
-    vqrshrn.s32     d19, q10, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s2 - s6);
-    vqrshrn.s32     d8, q4, #14               ; >> 14
-    vqrshrn.s32     d9, q5, #14               ; >> 14
-
-    ; cospi_8_64  * x4
-    vmull.s16       q5, d2, d30
-    vmull.s16       q6, d3, d30
-
-    ; cospi_24_64 * x4
-    vmull.s16       q7, d2, d31
-    vmull.s16       q0, d3, d31
-
-    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-    vmlal.s16       q5, d6, d31
-    vmlal.s16       q6, d7, d31
-
-    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-    vmlsl.s16       q7, d6, d30
-    vmlsl.s16       q0, d7, d30
-
-    ; cospi_8_64  * x7
-    vmull.s16       q1, d4, d30
-    vmull.s16       q3, d5, d30
-
-    ; cospi_24_64 * x7
-    vmull.s16       q10, d4, d31
-    vmull.s16       q2, d5, d31
-
-    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-    vmlsl.s16       q1, d8, d31
-    vmlsl.s16       q3, d9, d31
-
-    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-    vmlal.s16       q10, d8, d30
-    vmlal.s16       q2, d9, d30
-
-    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
-
-    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
-
-    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
-
-    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
-
-    ; (s4 + s6)
-    vadd.s32        q14, q5, q1
-    vadd.s32        q15, q6, q3
-
-    ; (s4 - s6)
-    vsub.s32        q5, q5, q1
-    vsub.s32        q6, q6, q3
-
-    ; x4 = dct_const_round_shift(s4 + s6);
-    vqrshrn.s32     d18, q14, #14             ; >> 14
-    vqrshrn.s32     d19, q15, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s4 - s6);
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; (s5 + s7)
-    vadd.s32        q1, q7, q10
-    vadd.s32        q3, q0, q2
-
-    ; (s5 - s7))
-    vsub.s32        q7, q7, q10
-    vsub.s32        q0, q0, q2
-
-    ; x5 = dct_const_round_shift(s5 + s7);
-    vqrshrn.s32     d28, q1, #14               ; >> 14
-    vqrshrn.s32     d29, q3, #14               ; >> 14
-
-    ; x7 = dct_const_round_shift(s5 - s7);
-    vqrshrn.s32     d14, q7, #14              ; >> 14
-    vqrshrn.s32     d15, q0, #14              ; >> 14
-
-    vdup.16         d30, r12                  ; duplicate cospi_16_64
-
-    ; cospi_16_64 * x2
-    vmull.s16       q2, d22, d30
-    vmull.s16       q3, d23, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q13, d22, d30
-    vmull.s16       q1, d23, d30
-
-    ; cospi_16_64 * x2 + cospi_16_64  * x3;
-    vmlal.s16       q2, d24, d30
-    vmlal.s16       q3, d25, d30
-
-    ; cospi_16_64 * x2 - cospi_16_64  * x3;
-    vmlsl.s16       q13, d24, d30
-    vmlsl.s16       q1, d25, d30
-
-    ; x2 = dct_const_round_shift(s2);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q3, #14               ; >> 14
-
-    ;x3 = dct_const_round_shift(s3);
-    vqrshrn.s32     d24, q13, #14             ; >> 14
-    vqrshrn.s32     d25, q1, #14              ; >> 14
-
-    ; cospi_16_64 * x6
-    vmull.s16       q13, d10, d30
-    vmull.s16       q1, d11, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q11, d10, d30
-    vmull.s16       q0, d11, d30
-
-    ; cospi_16_64 * x6 + cospi_16_64  * x7;
-    vmlal.s16       q13, d14, d30
-    vmlal.s16       q1, d15, d30
-
-    ; cospi_16_64 * x6 - cospi_16_64  * x7;
-    vmlsl.s16       q11, d14, d30
-    vmlsl.s16       q0, d15, d30
-
-    ; x6 = dct_const_round_shift(s6);
-    vqrshrn.s32     d20, q13, #14             ; >> 14
-    vqrshrn.s32     d21, q1, #14              ; >> 14
-
-    ;x7 = dct_const_round_shift(s7);
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q0, #14              ; >> 14
-
-    vdup.16         q5, r10                   ; duplicate 0
-
-    vsub.s16        q9, q5, q9                ; output[1] = -x4;
-    vsub.s16        q11, q5, q2               ; output[3] = -x2;
-    vsub.s16        q13, q5, q6               ; output[5] = -x7;
-    vsub.s16        q15, q5, q4               ; output[7] = -x1;
-    MEND
-
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht8x8_64_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    push            {r0-r10}
-    vpush           {d8-d15}
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; first transform rows
-    IDCT8x8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; then transform columns
-    IADST8X8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; then transform columns
-    IDCT8x8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; then transform columns
-    IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
-    vpop           {d8-d15}
-    pop            {r0-r10}
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-    bx          lr
-    ENDP  ; |vp9_iht8x8_64_add_neon|
-
-    END
diff --git a/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 0000000..04b342c
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
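+// Transform constants in 14-bit fixed point: cospi_n_64 corresponds to
+// round(cos(n * pi / 64) * 2^14); the vqrshrn_n_s32(..., 14) shifts below
+// drop the extra 14 bits again.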
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
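+// Transpose an 8x8 block of 16-bit coefficients held one row per register in
+// q8-q15.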
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
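+// One pass of the 8x8 IDCT applied to all eight columns in parallel; the rows
+// in q8-q15 are overwritten with the result. The cosine constants are
+// materialized inside the function with vdup_n_s16().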
+static INLINE void IDCT8x8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16  = vqrshrn_n_s32(q2s32, 14);
+    d9s16  = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16  = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16   = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16   = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32  = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32  = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16  = vaddq_s16(q0s16, q7s16);
+    *q9s16  = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
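+// One pass of the 8x8 IADST applied to all eight columns in parallel; the
+// rows in q8-q15 are overwritten with the result. The sine/cosine constants
+// are materialized inside the function with vdup_n_s16().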
+static INLINE void IADST8X8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q2s16, q4s16, q5s16, q6s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    d14s16 = vdup_n_s16(cospi_2_64);
+    d15s16 = vdup_n_s16(cospi_30_64);
+
+    q1s32 = vmull_s16(d30s16, d14s16);
+    q2s32 = vmull_s16(d31s16, d14s16);
+    q3s32 = vmull_s16(d30s16, d15s16);
+    q4s32 = vmull_s16(d31s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_18_64);
+    d31s16 = vdup_n_s16(cospi_14_64);
+
+    q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+    q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+    q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+    q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+    q5s32 = vmull_s16(d22s16, d30s16);
+    q6s32 = vmull_s16(d23s16, d30s16);
+    q7s32 = vmull_s16(d22s16, d31s16);
+    q8s32 = vmull_s16(d23s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+    q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+    q11s32 = vaddq_s32(q1s32, q5s32);
+    q12s32 = vaddq_s32(q2s32, q6s32);
+    q1s32 = vsubq_s32(q1s32, q5s32);
+    q2s32 = vsubq_s32(q2s32, q6s32);
+
+    d22s16 = vqrshrn_n_s32(q11s32, 14);
+    d23s16 = vqrshrn_n_s32(q12s32, 14);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q12s32 = vaddq_s32(q3s32, q7s32);
+    q15s32 = vaddq_s32(q4s32, q8s32);
+    q3s32 = vsubq_s32(q3s32, q7s32);
+    q4s32 = vsubq_s32(q4s32, q8s32);
+
+    d2s16  = vqrshrn_n_s32(q1s32, 14);
+    d3s16  = vqrshrn_n_s32(q2s32, 14);
+    d24s16 = vqrshrn_n_s32(q12s32, 14);
+    d25s16 = vqrshrn_n_s32(q15s32, 14);
+    d6s16  = vqrshrn_n_s32(q3s32, 14);
+    d7s16  = vqrshrn_n_s32(q4s32, 14);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    d0s16 = vdup_n_s16(cospi_10_64);
+    d1s16 = vdup_n_s16(cospi_22_64);
+    q4s32 = vmull_s16(d26s16, d0s16);
+    q5s32 = vmull_s16(d27s16, d0s16);
+    q2s32 = vmull_s16(d26s16, d1s16);
+    q6s32 = vmull_s16(d27s16, d1s16);
+
+    d30s16 = vdup_n_s16(cospi_26_64);
+    d31s16 = vdup_n_s16(cospi_6_64);
+
+    q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+    q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+    q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+    q0s32 = vmull_s16(d18s16, d30s16);
+    q13s32 = vmull_s16(d19s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+    q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+    q10s32 = vmull_s16(d18s16, d31s16);
+    q9s32 = vmull_s16(d19s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+    q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+    q14s32 = vaddq_s32(q2s32, q10s32);
+    q15s32 = vaddq_s32(q6s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q6s32 = vsubq_s32(q6s32, q9s32);
+
+    d28s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    q9s32 = vaddq_s32(q4s32, q0s32);
+    q10s32 = vaddq_s32(q5s32, q13s32);
+    q4s32 = vsubq_s32(q4s32, q0s32);
+    q5s32 = vsubq_s32(q5s32, q13s32);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    d18s16 = vqrshrn_n_s32(q9s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+    d8s16 = vqrshrn_n_s32(q4s32, 14);
+    d9s16 = vqrshrn_n_s32(q5s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q5s32 = vmull_s16(d2s16, d30s16);
+    q6s32 = vmull_s16(d3s16, d30s16);
+    q7s32 = vmull_s16(d2s16, d31s16);
+    q0s32 = vmull_s16(d3s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+    q1s32 = vmull_s16(d4s16, d30s16);
+    q3s32 = vmull_s16(d5s16, d30s16);
+    q10s32 = vmull_s16(d4s16, d31s16);
+    q2s32 = vmull_s16(d5s16, d31s16);
+
+    q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+    q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+    *q8s16 = vaddq_s16(*q11s16, *q9s16);
+    *q11s16 = vsubq_s16(*q11s16, *q9s16);
+    q4s16 = vaddq_s16(*q12s16, *q14s16);
+    *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+    q14s32 = vaddq_s32(q5s32, q1s32);
+    q15s32 = vaddq_s32(q6s32, q3s32);
+    q5s32 = vsubq_s32(q5s32, q1s32);
+    q6s32 = vsubq_s32(q6s32, q3s32);
+
+    d18s16 = vqrshrn_n_s32(q14s32, 14);
+    d19s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q1s32 = vaddq_s32(q7s32, q10s32);
+    q3s32 = vaddq_s32(q0s32, q2s32);
+    q7s32 = vsubq_s32(q7s32, q10s32);
+    q0s32 = vsubq_s32(q0s32, q2s32);
+
+    d28s16 = vqrshrn_n_s32(q1s32, 14);
+    d29s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q7s32, 14);
+    d15s16 = vqrshrn_n_s32(q0s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    q2s32 = vmull_s16(d22s16, d30s16);
+    q3s32 = vmull_s16(d23s16, d30s16);
+    q13s32 = vmull_s16(d22s16, d30s16);
+    q1s32 = vmull_s16(d23s16, d30s16);
+
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+    q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q3s32, 14);
+    d24s16 = vqrshrn_n_s32(q13s32, 14);
+    d25s16 = vqrshrn_n_s32(q1s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    q13s32 = vmull_s16(d10s16, d30s16);
+    q1s32 = vmull_s16(d11s16, d30s16);
+    q11s32 = vmull_s16(d10s16, d30s16);
+    q0s32 = vmull_s16(d11s16, d30s16);
+
+    q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+    q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+    d20s16 = vqrshrn_n_s32(q13s32, 14);
+    d21s16 = vqrshrn_n_s32(q1s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q0s32, 14);
+    *q10s16 = vcombine_s16(d20s16, d21s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q5s16 = vdupq_n_s16(0);
+
+    *q9s16  = vsubq_s16(q5s16, *q9s16);
+    *q11s16 = vsubq_s16(q5s16, q2s16);
+    *q13s16 = vsubq_s16(q5s16, q6s16);
+    *q15s16 = vsubq_s16(q5s16, q4s16);
+    return;
+}
+
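+// Inverse 8x8 hybrid transform (row/column DCT or ADST selected by tx_type),
+// followed by rounding and accumulation into dest. tx_type 0 (idct_idct)
+// falls back to the C implementation; types 1-3 are handled here.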
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    int i;
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16  = vld1q_s16(input);
+    q9s16  = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 8 * 2);
+    q11s16 = vld1q_s16(input + 8 * 3);
+    q12s16 = vld1q_s16(input + 8 * 4);
+    q13s16 = vld1q_s16(input + 8 * 5);
+    q14s16 = vld1q_s16(input + 8 * 6);
+    q15s16 = vld1q_s16(input + 8 * 7);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C.
+        vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // IDCT constants are generated inside IDCT8x8_1D
+
+        // first transform rows
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // IADST constants are generated inside IADST8X8_1D
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 2:  // idct_iadst
+        // IADST constants are generated inside IADST8X8_1D
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // IDCT constants are generated inside IDCT8x8_1D
+
+        // then transform columns
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 3:  // iadst_iadst
+        // IADST constants are generated inside IADST8X8_1D
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      default:  // unreachable: tx_type 0-3 are handled above
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    for (d1 = d2 = dest, i = 0; i < 2; i++) {
+        if (i != 0) {
+            q8s16 = q12s16;
+            q9s16 = q13s16;
+            q10s16 = q14s16;
+            q11s16 = q15s16;
+        }
+
+        d0u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d1u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d2u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+
+        q8u16  = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                          vreinterpret_u8_u64(d0u64));
+        q9u16  = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_u64(d1u64));
+        q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                          vreinterpret_u8_u64(d2u64));
+        q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                          vreinterpret_u8_u64(d3u64));
+
+        d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+    }
+    return;
+}
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
deleted file mode 100644
index 6ebea9f..0000000
--- a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
-#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-#define CROP_WIDTH 512
-extern uint8_t *vp9_ff_cropTbl;
-
-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
-                                                                               \
-  int32_t tmp, out;                                                            \
-  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \
-  int     in = input;                                                          \
-                                                                               \
-  __asm__ __volatile__ (                                                       \
-      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \
-      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\
-      "mthi     $zero,                  $ac1                              \n\t"\
-      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\
-      "extp     %[tmp],                 $ac1,             31              \n\t"\
-                                                                               \
-      /* out = dct_const_round_shift(out * cospi_16_64); */                    \
-      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\
-      "mthi     $zero,                  $ac2                              \n\t"\
-      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\
-      "extp     %[out],                 $ac2,             31              \n\t"\
-                                                                               \
-      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \
-      : [in] "r" (in),                                                         \
-        [dct_cost_rounding] "r" (dct_cost_rounding),                           \
-        [cospi_16_64] "r" (cospi_16_64)                                        \
-   );                                                                          \
-  out;                                                                    })
-
-static INLINE void vp9_prefetch_load(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   0,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   1,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
-static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   4,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   5,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
-void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                   int dest_stride);
-
-void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h);
-
-void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h);
-
-void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4,
-                                  int w, int h);
-
-void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter,
-                         int w, int h);
-
-void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 19c582f..6ca83a0 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -16,1072 +16,11 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
 
 #if HAVE_DSPR2
-static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
-                              uint32_t no_rows) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_10, step1_11, step1_12, step1_13;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-
-  for (i = no_rows; i--; ) {
-    /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 16));
-
-    __asm__ __volatile__ (
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             4(%[input])                     \n\t"
-        "lh       %[load6],             28(%[input])                    \n\t"
-        "lh       %[load7],             20(%[input])                    \n\t"
-        "lh       %[load8],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
-        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
-        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
-        "sh       %[load5],             0(%[output])                    \n\t"
-        "sh       %[load6],             32(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "sh       %[load5],             192(%[output])                  \n\t"
-        "sh       %[load6],             224(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
-        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "sh       %[load5],             256(%[output])                  \n\t"
-        "sh       %[load6],             288(%[output])                  \n\t"
-        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
-        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
-        "sh       %[load5],             448(%[output])                  \n\t"
-        "sh       %[load6],             480(%[output])                  \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6)
-        : [output] "r" (output),
-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
-          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
-          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
-          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
-          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
-    );
-
-    __asm__ __volatile__ (
-        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
-        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
-        "sh       %[load5],             64(%[output])                   \n\t"
-        "sh       %[load6],             96(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
-        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
-        "sh       %[load5],             128(%[output])                  \n\t"
-        "sh       %[load6],             160(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
-        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
-        "sh       %[load5],             320(%[output])                  \n\t"
-        "sh       %[load6],             352(%[output])                  \n\t"
-        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
-        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
-        "sh       %[load5],             384(%[output])                  \n\t"
-        "sh       %[load6],             416(%[output])                  \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6)
-        : [output] "r" (output),
-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
-    );
-
-    input += 16;
-    output += 1;
-  }
-}
-
-static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                      int dest_stride) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_8, step1_9, step1_10, step1_11;
-  int step1_12, step1_13, step1_14, step1_15;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = vp9_ff_cropTbl;
-
-  /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
-
-  for (i = 0; i < 16; ++i) {
-    dest_pix = (dest + i);
-    __asm__ __volatile__ (
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
-          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
-          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
-          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
-        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
-        "extp     %[result4],           $ac2,            31             \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
-          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
-          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
-        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
-        "extp     %[result1],           $ac1,        31                 \n\t"
-
-        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
-        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
-        "extp     %[result2],           $ac3,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
-        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
-        "extp     %[result3],           $ac1,        31                 \n\t"
-
-        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
-        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
-        "extp     %[result4],           $ac2,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r" (load1), [load2] "=&r" (load2),
-          [load3] "=&r" (load3), [load4] "=&r" (load4),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
-          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
-          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
-          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
-    );
-
-    __asm__ __volatile__ (
-        "lh       %[load5],             4(%[input])                   \n\t"
-        "lh       %[load6],             28(%[input])                  \n\t"
-        "lh       %[load7],             20(%[input])                  \n\t"
-        "lh       %[load8],             12(%[input])                  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
-        "mthi     $zero,                $ac3                          \n\t"
-
-        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
-        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
-        "extp     %[result1],           $ac1,        31               \n\t"
-
-        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
-        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
-        "extp     %[result2],           $ac3,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
-        "mthi     $zero,                $ac2                          \n\t"
-
-        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
-        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
-        "extp     %[result3],           $ac1,        31               \n\t"
-
-        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
-        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
-        "extp     %[result4],           $ac2,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [load7] "=&r" (load7), [load8] "=&r" (load8),
-          [result1] "=&r" (result1), [result2] "=&r" (result2),
-          [result3] "=&r" (result3), [result4] "=&r" (result4),
-          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
-          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
-        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    __asm__ __volatile__ (
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6),
-          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
-          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
-          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
-          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
-          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
-          [cospi_16_64] "r" (cospi_16_64)
-    );
-
-    step1_8 = step2_8 + step2_11;
-    step1_9 = step2_9 + step2_10;
-    step1_14 = step2_13 + step2_14;
-    step1_15 = step2_12 + step2_15;
-
-    __asm__ __volatile__ (
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-
-        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
-          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
-        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
-          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
-          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
-          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
-          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
-          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
-          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
-          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
-          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
-    );
-
-    input += 16;
-  }
-}
-
-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows
-  idct16_rows_dspr2(input, out, 16);
-
-  // Then transform columns and add to dest
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-static void iadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = output[8]
-              = output[9] = output[10] = output[11] = output[12]
-              = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-
-  output[0] =  x0;
-  output[1] = -x8;
-  output[2] =  x12;
-  output[3] = -x4;
-  output[4] =  x6;
-  output[5] =  x14;
-  output[6] =  x10;
-  output[7] =  x2;
-  output[8] =  x3;
-  output[9] =  x11;
-  output[10] =  x15;
-  output[11] =  x7;
-  output[12] =  x5;
-  output[13] = -x13;
-  output[14] =  x9;
-  output[15] = -x1;
-}
-
 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
                                 int pitch, int tx_type) {
   int i, j;
@@ -1108,7 +47,7 @@
       outptr = out;
 
       for (i = 0; i < 16; ++i) {
-        iadst16(outptr, temp_out);
+        iadst16_dspr2(outptr, temp_out);
 
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
@@ -1123,9 +62,9 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16(input, outptr);
+        iadst16_dspr2(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1143,9 +82,9 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
-        iadst16(input, outptr);
+        iadst16_dspr2(input, outptr);
         input += 16;
         outptr += 16;
       }
@@ -1153,7 +92,7 @@
       for (i = 0; i < 16; ++i) {
         for (j = 0; j < 16; ++j)
           temp_in[j] = out[j * 16 + i];
-        iadst16(temp_in, temp_out);
+        iadst16_dspr2(temp_in, temp_out);
         for (j = 0; j < 16; ++j)
           dest[j * pitch + i] =
                     clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
@@ -1166,150 +105,4 @@
       break;
   }
 }
-
-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-  for (i = 0; i < 6; ++i) {
-    __asm__ __volatile__ (
-        "sw     $zero,    0(%[outptr])     \n\t"
-        "sw     $zero,   32(%[outptr])     \n\t"
-        "sw     $zero,   64(%[outptr])     \n\t"
-        "sw     $zero,   96(%[outptr])     \n\t"
-        "sw     $zero,  128(%[outptr])     \n\t"
-        "sw     $zero,  160(%[outptr])     \n\t"
-        "sw     $zero,  192(%[outptr])     \n\t"
-        "sw     $zero,  224(%[outptr])     \n\t"
-        "sw     $zero,  256(%[outptr])     \n\t"
-        "sw     $zero,  288(%[outptr])     \n\t"
-        "sw     $zero,  320(%[outptr])     \n\t"
-        "sw     $zero,  352(%[outptr])     \n\t"
-        "sw     $zero,  384(%[outptr])     \n\t"
-        "sw     $zero,  416(%[outptr])     \n\t"
-        "sw     $zero,  448(%[outptr])     \n\t"
-        "sw     $zero,  480(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r" (outptr)
-    );
-
-    outptr += 2;
-  }
-
-  // Then transform columns
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__ (
-      "addi     %[out],     %[out],     32      \n\t"
-      "sra      %[a1],      %[out],     6       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__ (
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__ (
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
 #endif  // #if HAVE_DSPR2
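
For readers of the removal above: a minimal plain-C sketch of what the DC-only
16x16 path computes. The DSPr2 assembly vectorizes exactly this arithmetic with
replv.qb plus saturating addu_s.qb/subu_s.qb, four destination bytes at a time.
The helper names (dc_round_shift, clip_byte) and the idct16x16_dc_add_c wrapper
are illustrative stand-ins rather than the upstream vpx_dsp functions; the
constants follow the values used elsewhere in libvpx.

  #include <stdint.h>

  #define DCT_CONST_BITS 14
  #define COSPI_16_64 11585            /* 2^14 * cos(pi/4) */

  static int32_t dc_round_shift(int32_t in) {
    /* same rounding as dct_const_round_shift() in the comments above */
    return (in + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  }

  static uint8_t clip_byte(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }

  static void idct16x16_dc_add_c(int16_t dc, uint8_t *dest, int stride) {
    /* two cospi_16_64 rounds (DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 above),
     * then the per-pixel "addi 32; sra 6" rounding from the assembly */
    int32_t out = dc_round_shift(dc * COSPI_16_64);
    int32_t a1;
    int r, c;
    out = dc_round_shift(out * COSPI_16_64);
    a1 = (out + 32) >> 6;
    for (r = 0; r < 16; ++r) {
      for (c = 0; c < 16; ++c)
        dest[c] = clip_byte(dest[c] + a1);
      dest += stride;
    }
  }

A single clamped add covers both the a1 < 0 (subu_s.qb) and a1 >= 0
(addu_s.qb) branches of the removed code.
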
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 1990348..c10979b 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -16,352 +16,11 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
 
 #if HAVE_DSPR2
-static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
-  int16_t   step_0, step_1, step_2, step_3;
-  int       Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int       i;
-
-  for (i = 4; i--; ) {
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-
-        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp1],             8(%[output])                    \n\t"
-
-        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp2],             16(%[output])                   \n\t"
-
-        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp3],             24(%[output])                   \n\t"
-
-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
-        [output] "+r" (output)
-      : [const_2_power_13] "r" (const_2_power_13),
-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
-        [cospi_24_64] "r" (cospi_24_64),
-        [input] "r" (input)
-    );
-
-    input += 4;
-    output += 1;
-  }
-}
-
-static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                               int dest_stride) {
-  int16_t   step_0, step_1, step_2, step_3;
-  int       Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int       i;
-  uint8_t   *dest_pix;
-  uint8_t   *cm = vp9_ff_cropTbl;
-
-  /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
-
-  for (i = 0; i < 4; ++i) {
-      dest_pix = (dest + i);
-
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
-        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
-        [dest_pix] "+r" (dest_pix)
-      : [const_2_power_13] "r" (const_2_power_13),
-        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
-        [cospi_24_64] "r" (cospi_24_64),
-        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
-    );
-
-    input += 4;
-  }
-}
-
-void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // Rows
-  vp9_idct4_rows_dspr2(input, outptr);
-
-  // Columns
-  vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  int       a1, absa1;
-  int       r;
-  int32_t   out;
-  int       t2, vector_a1, vector_a;
-  uint32_t  pos = 45;
-  int16_t   input_dc = input[0];
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
-  __asm__ __volatile__ (
-      "addi     %[out],     %[out],    8       \n\t"
-      "sra      %[a1],      %[out],    4       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__ (
-          "lw             %[t2],          0(%[dest])                      \n\t"
-          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
-          "sw             %[vector_a],    0(%[dest])                      \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb       %[vector_a1],   %[a1]     \n\t"
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t2],          0(%[dest])                        \n\t"
-          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
-          "sw           %[vector_a],    0(%[dest])                        \n\t"
-          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
-
-          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
-
-static void iadst4_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-
 void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride, int tx_type) {
   int i, j;
@@ -379,11 +38,11 @@
 
   switch (tx_type) {
     case DCT_DCT:   // DCT in both horizontal and vertical
-      vp9_idct4_rows_dspr2(input, outptr);
-      vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      vpx_idct4_rows_dspr2(input, outptr);
+      vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
       break;
     case ADST_DCT:  // ADST in vertical, DCT in horizontal
-      vp9_idct4_rows_dspr2(input, outptr);
+      vpx_idct4_rows_dspr2(input, outptr);
 
       outptr = out;
 
@@ -410,7 +69,7 @@
           temp_in[i * 4 + j] = out[j * 4 + i];
         }
       }
-      vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
       break;
     case ADST_ADST:  // ADST in both directions
       for (i = 0; i < 4; ++i) {
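
For reference, the scalar arithmetic that the removed vp9_idct4_rows_dspr2
implements (per its inline comments) is roughly the sketch below; the
madd/msub/extp sequences on $ac0/$ac1 perform dct_const_round_shift in the DSP
accumulators, and the row pass stores results at byte offsets 0/8/16/24 so the
output comes out transposed for the column pass. The constants and the
dct_round_shift helper are the same illustrative stand-ins as in the earlier
sketch, not the upstream vpx_dsp code.

  /* Assumed constants, as used throughout libvpx's transform code. */
  #define COSPI_8_64  15137
  #define COSPI_16_64 11585
  #define COSPI_24_64 6270

  static int dct_round_shift(int in) {
    return (in + (1 << 13)) >> 14;     /* dct_const_round_shift() */
  }

  /* One 4-point IDCT; the row pass runs this four times, writing the i-th
   * result of row r to out[i * 4 + r] so the column pass reads columns as
   * contiguous rows. */
  static void idct4_1d(const int16_t *in, int16_t *out) {
    int step_0 = dct_round_shift((in[0] + in[2]) * COSPI_16_64);
    int step_1 = dct_round_shift((in[0] - in[2]) * COSPI_16_64);
    int step_2 = dct_round_shift(in[1] * COSPI_24_64 - in[3] * COSPI_8_64);
    int step_3 = dct_round_shift(in[1] * COSPI_8_64 + in[3] * COSPI_24_64);
    out[0] = (int16_t)(step_0 + step_3);
    out[1] = (int16_t)(step_1 + step_2);
    out[2] = (int16_t)(step_1 - step_2);
    out[3] = (int16_t)(step_0 - step_3);
  }

The removed column pass repeats the same butterfly, then adds
(value + 8) >> 4 to each destination pixel ("addi 8; sra 4") and clamps it
through vp9_ff_cropTbl via the lbux lookup.
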
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index fc44ffa..37f3ca9 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -15,537 +15,11 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
 
 #if HAVE_DSPR2
-static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
-                             uint32_t no_rows) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  const int const_2_power_13 = 8192;
-  int Temp0, Temp1, Temp2, Temp3, Temp4;
-  int i;
-
-  for (i = no_rows; i--; ) {
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[Temp4],             $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp1],             16(%[output])                   \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp0],             32(%[output])                   \n\t"
-        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp1],             48(%[output])                   \n\t"
-
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp0],             64(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp1],             80(%[output])                   \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp0],             96(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp1],             112(%[output])                  \n\t"
-
-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-          [Temp4] "=&r" (Temp4)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_24_64] "r" (cospi_24_64),
-          [output] "r" (output), [input] "r" (input)
-    );
-
-    input += 8;
-    output += 1;
-  }
-}
-
-static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                        int dest_stride) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int Temp0, Temp1, Temp2, Temp3;
-  int i;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = vp9_ff_cropTbl;
-
-  /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
-
-  for (i = 0; i < 8; ++i) {
-      dest_pix = (dest + i);
-
-    __asm__ __volatile__ (
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_6],           $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /* add block */
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
-          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
-          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
-          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
-          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
-          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
-          [dest_pix] "+r" (dest_pix)
-        : [const_2_power_13] "r" (const_2_power_13),
-          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
-          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
-          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
-          [cospi_24_64] "r" (cospi_24_64),
-          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
-    );
-
-    input += 8;
-  }
-}
-
-void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 8);
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-static void iadst8_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3, x4, x5, x6, x7;
-
-  x0 = input[7];
-  x1 = input[0];
-  x2 = input[5];
-  x3 = input[2];
-  x4 = input[3];
-  x5 = input[4];
-  x6 = input[1];
-  x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
-  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
-  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
-  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
-
-  output[0] =  x0;
-  output[1] = -x4;
-  output[2] =  x6;
-  output[3] = -x2;
-  output[4] =  x3;
-  output[5] = -x7;
-  output[6] =  x5;
-  output[7] = -x1;
-}
-
 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride, int tx_type) {
   int i, j;
@@ -616,130 +90,4 @@
       break;
   }
 }
-
-void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp    %[pos],    1    \n\t"
-    :
-    : [pos] "r" (pos)
-  );
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-
-  __asm__ __volatile__ (
-      "sw  $zero,   0(%[outptr])  \n\t"
-      "sw  $zero,   4(%[outptr])  \n\t"
-      "sw  $zero,  16(%[outptr])  \n\t"
-      "sw  $zero,  20(%[outptr])  \n\t"
-      "sw  $zero,  32(%[outptr])  \n\t"
-      "sw  $zero,  36(%[outptr])  \n\t"
-      "sw  $zero,  48(%[outptr])  \n\t"
-      "sw  $zero,  52(%[outptr])  \n\t"
-      "sw  $zero,  64(%[outptr])  \n\t"
-      "sw  $zero,  68(%[outptr])  \n\t"
-      "sw  $zero,  80(%[outptr])  \n\t"
-      "sw  $zero,  84(%[outptr])  \n\t"
-      "sw  $zero,  96(%[outptr])  \n\t"
-      "sw  $zero, 100(%[outptr])  \n\t"
-      "sw  $zero, 112(%[outptr])  \n\t"
-      "sw  $zero, 116(%[outptr])  \n\t"
-
-      :
-      : [outptr] "r" (outptr)
-  );
-
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t t1, t2, vector_a1, vector_1, vector_2;
-
-  /* bit position for extract from acc */
-  __asm__ __volatile__ (
-    "wrdsp      %[pos],     1           \n\t"
-
-    :
-    : [pos] "r" (pos)
-  );
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__ (
-      "addi     %[out],     %[out],     16      \n\t"
-      "sra      %[a1],      %[out],     5       \n\t"
-
-      : [out] "+r" (out), [a1] "=r" (a1)
-      :
-  );
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [dest] "+&r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__ (
-        "replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-        : [vector_a1] "=r" (vector_a1)
-        : [a1] "r" (a1)
-    );
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__ (
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r" (t1), [t2] "=&r" (t2),
-            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
-            [dest] "+r" (dest)
-          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
-      );
-    }
-  }
-}
 #endif  // #if HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
new file mode 100644
index 0000000..5adf0aa
--- /dev/null
+++ b/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride, int32_t tx_type) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+  int16_t *out_ptr = &out[0];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    case DCT_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
diff --git a/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
new file mode 100644
index 0000000..75977b1
--- /dev/null
+++ b/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+
+  /* load vector elements of 4x4 block */
+  LD4x4_SH(input, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* final rounding (add 2^3, divide by 2^4) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 4);
+  /* add block and store 4x4 */
+  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
diff --git a/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
new file mode 100644
index 0000000..65d2993
--- /dev/null
+++ b/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+  /* add block and store 8x8 */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
diff --git a/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c b/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c
new file mode 100644
index 0000000..7257cd6
--- /dev/null
+++ b/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  uint64_t src0_d, src1_d, dst0_d, dst1_d;
+  v16i8 src0 = { 0 };
+  v16i8 src1 = { 0 };
+  v16i8 dst0 = { 0 };
+  v16i8 dst1 = { 0 };
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 2; row--;) {
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src0);
+    INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src1);
+    INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst0, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst1, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+  }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+                                      int32_t src_stride,
+                                      uint8_t *dst_ptr,
+                                      int32_t dst_stride,
+                                      int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 4; row--;) {
+    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src2, src_r, src_l);
+    UNPCK_UB_SH(dst2, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src3, src_r, src_l);
+    UNPCK_UB_SH(dst3, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+  }
+}
+
+void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+
+void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   int src_weight) {
+  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index b379656..ac417b6 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -11,21 +11,28 @@
 #include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_systemdependent.h"
 
-static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
-  int i;
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
 
-  // Top border row
-  vpx_memset(mi, 0, sizeof(*mi) * cm->mi_stride);
-
-  // Left border column
-  for (i = 1; i < cm->mi_rows + 1; ++i)
-    vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi));
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
 }
 
 void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
@@ -34,192 +41,142 @@
 
   cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
   cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
 
   cm->mb_cols = (cm->mi_cols + 1) >> 1;
   cm->mb_rows = (cm->mi_rows + 1) >> 1;
   cm->MBs = cm->mb_rows * cm->mb_cols;
 }
 
-static void setup_mi(VP9_COMMON *cm) {
-  cm->mi = cm->mip + cm->mi_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
-  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
-
-  vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
-
-  vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) *
-                                      sizeof(*cm->mi_grid_base));
-
-  clear_mi_border(cm, cm->prev_mip);
-}
-
-static int alloc_mi(VP9_COMMON *cm, int mi_size) {
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
   int i;
 
-  for (i = 0; i < 2; ++i) {
-    cm->mip_array[i] =
-        (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
-    if (cm->mip_array[i] == NULL)
-      return 1;
-
-    cm->mi_grid_base_array[i] =
-        (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
-    if (cm->mi_grid_base_array[i] == NULL)
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1);
+    if (cm->seg_map_array[i] == NULL)
       return 1;
   }
+  cm->seg_map_alloc_size = seg_map_size;
 
   // Init the index.
-  cm->mi_idx = 0;
-  cm->prev_mi_idx = 1;
+  cm->seg_map_idx = 0;
+  cm->prev_seg_map_idx = 1;
 
-  cm->mip = cm->mip_array[cm->mi_idx];
-  cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
-  cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
-  cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  if (!cm->frame_parallel_decode)
+    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
 
   return 0;
 }
 
-static void free_mi(VP9_COMMON *cm) {
+static void free_seg_map(VP9_COMMON *cm) {
   int i;
 
-  for (i = 0; i < 2; ++i) {
-    vpx_free(cm->mip_array[i]);
-    cm->mip_array[i] = NULL;
-    vpx_free(cm->mi_grid_base_array[i]);
-    cm->mi_grid_base_array[i] = NULL;
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    vpx_free(cm->seg_map_array[i]);
+    cm->seg_map_array[i] = NULL;
   }
 
-  cm->mip = NULL;
-  cm->prev_mip = NULL;
-  cm->mi_grid_base = NULL;
-  cm->prev_mi_grid_base = NULL;
+  cm->current_frame_seg_map = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = NULL;
+  }
 }
 
-void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
+void vp9_free_ref_frame_buffers(BufferPool *pool) {
   int i;
 
   for (i = 0; i < FRAME_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
-
-    if (cm->frame_bufs[i].ref_count > 0 &&
-        cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
-      cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
-      cm->frame_bufs[i].ref_count = 0;
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].ref_count = 0;
     }
+    vpx_free(pool->frame_bufs[i].mvs);
+    pool->frame_bufs[i].mvs = NULL;
+    vpx_free_frame_buffer(&pool->frame_bufs[i].buf);
   }
+}
 
-  vp9_free_frame_buffer(&cm->post_proc_buffer);
+void vp9_free_postproc_buffers(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+  vpx_free_frame_buffer(&cm->post_proc_buffer);
+  vpx_free_frame_buffer(&cm->post_proc_buffer_int);
+#else
+  (void)cm;
+#endif
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
-  free_mi(cm);
-
-  vpx_free(cm->last_frame_seg_map);
-  cm->last_frame_seg_map = NULL;
-
+  cm->free_mi(cm);
+  free_seg_map(cm);
   vpx_free(cm->above_context);
   cm->above_context = NULL;
-
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
 }
 
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
-  vp9_free_context_buffers(cm);
+  int new_mi_size;
 
   vp9_set_mb_mi(cm, width, height);
-  if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail;
-
-  cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
-  if (!cm->last_frame_seg_map) goto fail;
-
-  cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-      2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-      sizeof(*cm->above_context));
-  if (!cm->above_context) goto fail;
-
-  cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
-      mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
-  if (!cm->above_seg_context) goto fail;
-
-  return 0;
-
- fail:
-  vp9_free_context_buffers(cm);
-  return 1;
-}
-
-static void init_frame_bufs(VP9_COMMON *cm) {
-  int i;
-
-  cm->new_fb_idx = FRAME_BUFFERS - 1;
-  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
-  for (i = 0; i < REF_FRAMES; ++i) {
-    cm->ref_frame_map[i] = i;
-    cm->frame_bufs[i].ref_count = 1;
-  }
-}
-
-int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
-  int i;
-  const int ss_x = cm->subsampling_x;
-  const int ss_y = cm->subsampling_y;
-
-  vp9_free_ref_frame_buffers(cm);
-
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    cm->frame_bufs[i].ref_count = 0;
-    if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
-                               ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    cm->free_mi(cm);
+    if (cm->alloc_mi(cm, new_mi_size))
       goto fail;
   }
 
-  init_frame_bufs(cm);
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+      goto fail;
+  }
 
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
-  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                             VP9_ENC_BORDER_IN_PIXELS) < 0)
-    goto fail;
-#endif
+  if (cm->above_context_alloc_cols < cm->mi_cols) {
+    vpx_free(cm->above_context);
+    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+        sizeof(*cm->above_context));
+    if (!cm->above_context) goto fail;
+
+    vpx_free(cm->above_seg_context);
+    cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+    if (!cm->above_seg_context) goto fail;
+    cm->above_context_alloc_cols = cm->mi_cols;
+  }
 
   return 0;
 
  fail:
-  vp9_free_ref_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
   return 1;
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
-  vp9_free_ref_frame_buffers(cm);
   vp9_free_context_buffers(cm);
-  vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
+
+  vpx_free(cm->fc);
+  cm->fc = NULL;
+  vpx_free(cm->frame_contexts);
+  cm->frame_contexts = NULL;
 }
 
 void vp9_init_context_buffers(VP9_COMMON *cm) {
-  setup_mi(cm);
-  if (cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+  cm->setup_mi(cm);
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+    memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
 }
 
-void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
   // Swap indices.
-  const int tmp = cm->mi_idx;
-  cm->mi_idx = cm->prev_mi_idx;
-  cm->prev_mi_idx = tmp;
+  const int tmp = cm->seg_map_idx;
+  cm->seg_map_idx = cm->prev_seg_map_idx;
+  cm->prev_seg_map_idx = tmp;
 
-  // Current mip will be the prev_mip for the next frame.
-  cm->mip = cm->mip_array[cm->mi_idx];
-  cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
-  cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
-  cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
-
-  // Update the upper left visible macroblock ptrs.
-  cm->mi = cm->mip + cm->mi_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
-  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
 }
diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h
index c5b893f..c0e51a6 100644
--- a/libvpx/vp9/common/vp9_alloccommon.h
+++ b/libvpx/vp9/common/vp9_alloccommon.h
@@ -12,11 +12,14 @@
 #ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
 #define VP9_COMMON_VP9_ALLOCCOMMON_H_
 
+#define INVALID_IDX -1  // Invalid buffer index.
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct VP9Common;
+struct BufferPool;
 
 void vp9_remove_common(struct VP9Common *cm);
 
@@ -24,14 +27,15 @@
 void vp9_init_context_buffers(struct VP9Common *cm);
 void vp9_free_context_buffers(struct VP9Common *cm);
 
-int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height);
-void vp9_free_ref_frame_buffers(struct VP9Common *cm);
+void vp9_free_ref_frame_buffers(struct BufferPool *pool);
+void vp9_free_postproc_buffers(struct VP9Common *cm);
 
 int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
 void vp9_free_state_buffers(struct VP9Common *cm);
 
 void vp9_set_mb_mi(struct VP9Common *cm, int width, int height);
-void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);
+
+void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/common/vp9_blockd.c b/libvpx/vp9/common/vp9_blockd.c
index dab8f96..e8334fc 100644
--- a/libvpx/vp9/common/vp9_blockd.c
+++ b/libvpx/vp9/common/vp9_blockd.c
@@ -50,39 +50,26 @@
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const int step = 1 << (tx_size << 1);
-  int i;
+  int i = 0, r, c;
 
   // If mb_to_right_edge is < 0 we are in a situation in which
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
-  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
-    int r, c;
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+  const int extra_step = ((num_4x4_w - max_blocks_wide) >> tx_size) * step;
 
-    int max_blocks_wide = num_4x4_w;
-    int max_blocks_high = num_4x4_h;
-
-    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
-    // it to 4x4 block sizes.
-    if (xd->mb_to_right_edge < 0)
-      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
-    if (xd->mb_to_bottom_edge < 0)
-      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-    i = 0;
-    // Unlike the normal case - in here we have to keep track of the
-    // row and column of the blocks we use so that we know if we are in
-    // the unrestricted motion border.
-    for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
-      for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
-        if (r < max_blocks_high && c < max_blocks_wide)
-          visit(plane, i, plane_bsize, tx_size, arg);
-        i += step;
-      }
-    }
-  } else {
-    for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
       visit(plane, i, plane_bsize, tx_size, arg);
+      i += step;
+    }
+    i += extra_step;
   }
 }
 
@@ -92,7 +79,7 @@
                                    void *arg) {
   int plane;
 
-  for (plane = 0; plane < MAX_MB_PLANE; plane++)
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
     vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
@@ -117,7 +104,7 @@
     for (i = above_contexts; i < tx_size_in_blocks; ++i)
       a[i] = 0;
   } else {
-    vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
   }
 
   // left
@@ -134,7 +121,7 @@
     for (i = left_contexts; i < tx_size_in_blocks; ++i)
       l[i] = 0;
   } else {
-    vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
   }
 }
 
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index 951e6e0..d776b44 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -17,43 +17,19 @@
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
 
-#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define BLOCK_SIZE_GROUPS 4
-#define SKIP_CONTEXTS 3
-#define INTER_MODE_CONTEXTS 7
-
-/* Segment Feature Masks */
-#define MAX_MV_REF_CANDIDATES 2
-
-#define INTRA_INTER_CONTEXTS 4
-#define COMP_INTER_CONTEXTS 5
-#define REF_CONTEXTS 5
-
-typedef enum {
-  PLANE_TYPE_Y  = 0,
-  PLANE_TYPE_UV = 1,
-  PLANE_TYPES
-} PLANE_TYPE;
-
-typedef char ENTROPY_CONTEXT;
-
-typedef char PARTITION_CONTEXT;
-
-static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
-                                           ENTROPY_CONTEXT b) {
-  return (a != 0) + (b != 0);
-}
+#define MAX_MB_PLANE 3
 
 typedef enum {
   KEY_FRAME = 0,
@@ -61,34 +37,10 @@
   FRAME_TYPES,
 } FRAME_TYPE;
 
-typedef enum {
-  DC_PRED,         // Average of above and left pixels
-  V_PRED,          // Vertical
-  H_PRED,          // Horizontal
-  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  D135_PRED,       // Directional 135 deg = 180 - 45
-  D117_PRED,       // Directional 117 deg = 180 - 63
-  D153_PRED,       // Directional 153 deg = 180 - 27
-  D207_PRED,       // Directional 207 deg = 180 + 27
-  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  TM_PRED,         // True-motion
-  NEARESTMV,
-  NEARMV,
-  ZEROMV,
-  NEWMV,
-  MB_MODE_COUNT
-} PREDICTION_MODE;
-
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEWMV;
 }
 
-#define INTRA_MODES (TM_PRED + 1)
-
-#define INTER_MODES (1 + NEWMV - NEARESTMV)
-
-#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -98,25 +50,17 @@
   int_mv as_mv[2];  // first, second inter predictor motion vectors
 } b_mode_info;
 
-typedef enum {
-  NONE = -1,
-  INTRA_FRAME = 0,
-  LAST_FRAME = 1,
-  GOLDEN_FRAME = 2,
-  ALTREF_FRAME = 3,
-  MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
+// Note that the rate-distortion optimization loop, bit-stream writer, and
+// decoder implementation modules critically rely on the defined entry values
+// specified herein. They should be refactored concurrently.
 
-static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
-  return b_width_log2_lookup[sb_type];
-}
-static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
-  return b_height_log2_lookup[sb_type];
-}
-
-static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
-  return mi_width_log2_lookup[sb_type];
-}
+#define NONE           -1
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
+typedef int8_t MV_REFERENCE_FRAME;
 
 // This structure now relates to 8x8 block regions.
 typedef struct {
@@ -132,14 +76,14 @@
   PREDICTION_MODE uv_mode;
 
   // Only for INTER blocks
-  MV_REFERENCE_FRAME ref_frame[2];
-  int_mv mv[2];
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  uint8_t mode_context[MAX_REF_FRAMES];
   INTERP_FILTER interp_filter;
+  MV_REFERENCE_FRAME ref_frame[2];
+
+  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
+  int_mv mv[2];
 } MB_MODE_INFO;
 
-typedef struct {
+typedef struct MODE_INFO {
   MB_MODE_INFO mbmi;
   b_mode_info bmi[4];
 } MODE_INFO;
@@ -168,23 +112,29 @@
   MV_PRECISION_Q4
 };
 
-enum { MAX_MB_PLANE = 3 };
-
 struct buf_2d {
   uint8_t *buf;
   int stride;
 };
 
 struct macroblockd_plane {
-  int16_t *dqcoeff;
+  tran_low_t *dqcoeff;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
   struct buf_2d dst;
   struct buf_2d pre[2];
-  const int16_t *dequant;
   ENTROPY_CONTEXT *above_context;
   ENTROPY_CONTEXT *left_context;
+  int16_t seg_dequant[MAX_SEGMENTS][2];
+
+  // number of 4x4s in current block
+  uint16_t n4_w, n4_h;
+  // log2 of n4_w, n4_h
+  uint8_t n4_wl, n4_hl;
+
+  // encoder
+  const int16_t *dequant;
 };
 
 #define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
@@ -199,41 +149,55 @@
 
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
+  uint8_t bmode_blocks_wl;
+  uint8_t bmode_blocks_hl;
+
+  FRAME_COUNTS *counts;
+  TileInfo tile;
 
   int mi_stride;
 
-  // A NULL indicates that the 8x8 is not part of the image
   MODE_INFO **mi;
+  MODE_INFO *left_mi;
+  MODE_INFO *above_mi;
+  MB_MODE_INFO *left_mbmi;
+  MB_MODE_INFO *above_mbmi;
 
   int up_available;
   int left_available;
 
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
+
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
   int mb_to_right_edge;
   int mb_to_top_edge;
   int mb_to_bottom_edge;
 
+  FRAME_CONTEXT *fc;
+  int frame_parallel_decoding_mode;
+
   /* pointers to reference frames */
   RefBuffer *block_refs[2];
 
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
-  /* mc buffer */
-  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
-
-  int lossless;
-
-  int corrupted;
-
-  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
   PARTITION_CONTEXT *above_seg_context;
   PARTITION_CONTEXT left_seg_context[8];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* Bit depth: 8, 10, 12 */
+  int bd;
+#endif
+
+  int lossless;
+  int corrupted;
+
+  struct vpx_internal_error_info *error_info;
 } MACROBLOCKD;
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
@@ -247,8 +211,9 @@
                                   const MACROBLOCKD *xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
-  if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
     return DCT_DCT;
+
   return intra_mode_to_tx_type_lookup[mbmi->mode];
 }
 
@@ -285,6 +250,27 @@
   return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
 }
 
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    memset(pd->above_context, 0,
+           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
+    memset(pd->left_context, 0,
+           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
+  }
+}
+
+static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
+                                               const MODE_INFO *above_mi,
+                                               const MODE_INFO *left_mi,
+                                               int block) {
+  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
+  return vp9_kf_y_mode_prob[above][left];
+}
+
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size,
@@ -302,7 +288,7 @@
 static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                             TX_SIZE tx_size, int block,
                                             int *x, int *y) {
-  const int bwl = b_width_log2(plane_bsize);
+  const int bwl = b_width_log2_lookup[plane_bsize];
   const int tx_cols_log2 = bwl - tx_size;
   const int tx_cols = 1 << tx_cols_log2;
   const int raster_mb = block >> (tx_size << 1);
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index 2788e66..76e7cd4 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -16,49 +16,29 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vpx_ports/bitops.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-
-#define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
-
-#define ALIGN_POWER_OF_TWO(value, n) \
-    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
-
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src) {            \
     assert(sizeof(dest) == sizeof(src)); \
-    vpx_memcpy(dest, src, sizeof(src));  \
+    memcpy(dest, src, sizeof(src));  \
   }
 
 // Use this for variably-sized arrays.
 #define vp9_copy_array(dest, src, n) {       \
     assert(sizeof(*dest) == sizeof(*src));   \
-    vpx_memcpy(dest, src, n * sizeof(*src)); \
+    memcpy(dest, src, n * sizeof(*src)); \
   }
 
-#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
-
-static INLINE uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
+#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index d4c1b71..0bf7cbc 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -8,35 +8,32 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common_data.h"
 
 // Log 2 conversion lookup tables for block width and height
-const int b_width_log2_lookup[BLOCK_SIZES] =
+const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
-const int b_height_log2_lookup[BLOCK_SIZES] =
+const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
   {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
-const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
-const int num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
   {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
 // Log 2 conversion lookup tables for modeinfo width and height
-const int mi_width_log2_lookup[BLOCK_SIZES] =
+const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
   {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
-const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
+const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
+const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
 // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
-const int size_group_lookup[BLOCK_SIZES] =
+const uint8_t size_group_lookup[BLOCK_SIZES] =
   {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
 
-const int num_pels_log2_lookup[BLOCK_SIZES] =
+const uint8_t num_pels_log2_lookup[BLOCK_SIZES] =
   {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
 
-
 const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
   {  // 4X4
     // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
index a06c9be..95a1179 100644
--- a/libvpx/vp9/common/vp9_common_data.h
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -12,20 +12,21 @@
 #define VP9_COMMON_VP9_COMMON_DATA_H_
 
 #include "vp9/common/vp9_enums.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-extern const int b_width_log2_lookup[BLOCK_SIZES];
-extern const int b_height_log2_lookup[BLOCK_SIZES];
-extern const int mi_width_log2_lookup[BLOCK_SIZES];
-extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
-extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
-extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
-extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES];
-extern const int size_group_lookup[BLOCK_SIZES];
-extern const int num_pels_log2_lookup[BLOCK_SIZES];
+extern const uint8_t b_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t b_height_log2_lookup[BLOCK_SIZES];
+extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t size_group_lookup[BLOCK_SIZES];
+extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES];
 extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
 extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
 extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c
deleted file mode 100644
index d8aaf32..0000000
--- a/libvpx/vp9/common/vp9_convolve.c
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/vp9_filter.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const InterpKernel *x_filters,
-                           int x0_q4, int x_step_q4, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const InterpKernel *x_filters,
-                               int x0_q4, int x_step_q4, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const InterpKernel *y_filters,
-                          int y0_q4, int y_step_q4, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *y_filters,
-                              int y0_q4, int y_step_q4, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, ptrdiff_t dst_stride,
-                     const InterpKernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters,
-                     int y0_q4, int y_step_q4,
-                     int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[135 * 64];
-  int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (intermediate_height < h)
-    intermediate_height = h;
-
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
-                           int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
-                 x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
-                     x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y,
-                y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
-                    y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                     uint8_t *dst, ptrdiff_t dst_stride,
-                     const int16_t *filter_x, int x_step_q4,
-                     const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride,
-           filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
-  assert(w <= 64);
-  assert(h <= 64);
-
-  vp9_convolve8_c(src, src_stride, temp, 64,
-                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
-}
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
-                         uint8_t *dst, ptrdiff_t dst_stride,
-                         const int16_t *filter_x, int filter_x_stride,
-                         const int16_t *filter_y, int filter_y_stride,
-                         int w, int h) {
-  int r;
-
-  (void)filter_x;  (void)filter_x_stride;
-  (void)filter_y;  (void)filter_y_stride;
-
-  for (r = h; r > 0; --r) {
-    vpx_memcpy(dst, src, w);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int filter_x_stride,
-                        const int16_t *filter_y, int filter_y_stride,
-                        int w, int h) {
-  int x, y;
-
-  (void)filter_x;  (void)filter_x_stride;
-  (void)filter_y;  (void)filter_y_stride;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x)
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c
index d2522bb..3d80103 100644
--- a/libvpx/vp9/common/vp9_debugmodes.c
+++ b/libvpx/vp9/common/vp9_debugmodes.c
@@ -25,31 +25,29 @@
 static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
                           size_t member_offset) {
   int mi_row, mi_col;
-  int mi_index = 0;
   MODE_INFO **mi = cm->mi_grid_visible;
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
   char prefix = descriptor[0];
 
   log_frame_info(cm, descriptor, file);
-  mi_index = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(file, "%2d ",
-              *((int*) ((char *) (&mi[mi_index]->mbmi) +
-                        member_offset)));
-      mi_index++;
+              *((int*) ((char *) (&mi[0]->mbmi) +
+                                  member_offset)));
+      mi++;
     }
     fprintf(file, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(file, "\n");
 }
+
 void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
   int mi_row;
   int mi_col;
-  int mi_index = 0;
   FILE *mvs = fopen(file, "a");
   MODE_INFO **mi = cm->mi_grid_visible;
   int rows = cm->mi_rows;
@@ -57,21 +55,35 @@
 
   print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
 
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
   log_frame_info(cm, "Vectors ", mvs);
+  mi = cm->mi_grid_visible;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
-                               mi[mi_index]->mbmi.mv[0].as_mv.col);
-      mi_index++;
+      fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
+                               mi[0]->mbmi.mv[0].as_mv.col);
+      mi++;
     }
     fprintf(mvs, "\n");
-    mi_index += 8;
+    mi += 8;
   }
   fprintf(mvs, "\n");
 
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 3a54de2..579857b 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -15,14 +15,46 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
-const vp9_prob vp9_cat1_prob[] = { 159 };
-const vp9_prob vp9_cat2_prob[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob[] = {
+// Unconstrained Node Tree
+const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  2, 6,                                // 0 = LOW_VAL
+  -TWO_TOKEN, 4,                       // 1 = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
+  8, 10,                               // 3 = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
+  12, 14,                              // 5 = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
+};
+
+const vpx_prob vp9_cat1_prob[] = { 159 };
+const vpx_prob vp9_cat2_prob[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob[] = {
     254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
+#if CONFIG_VP9_HIGHBITDEPTH
+const vpx_prob vp9_cat1_prob_high10[] = { 159 };
+const vpx_prob vp9_cat2_prob_high10[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high10[] = {
+    255, 255, 254, 254, 254, 252, 249, 243,
+    230, 196, 177, 153, 140, 133, 130, 129
+};
+const vpx_prob vp9_cat1_prob_high12[] = { 159 };
+const vpx_prob vp9_cat2_prob_high12[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high12[] = {
+    255, 255, 255, 255, 254, 254, 254, 252, 249,
+    243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+#endif
 
 const uint8_t vp9_coefband_trans_8x8plus[1024] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
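The vp9_coef_con_tree added above uses the flat vpx_tree_index layout: each interior node occupies an even index pair, and every entry is either the even index of another pair or the negated value of a leaf token. The sketch below is not libvpx code; tree_lookup and the pre-decoded bits array are illustrative stand-ins for the arithmetic decoder, which in the real code reads each bit with the probability belonging to node i >> 1.

#include <stdint.h>

typedef int8_t vpx_tree_index;  /* assumed to match the typedef in vpx_dsp/prob.h */

/* Walk a flat token tree given already-decoded bits: follow tree[i + bit]
 * until the entry is no longer positive, then return the negated leaf value. */
static int tree_lookup(const vpx_tree_index *tree, const int *bits) {
  vpx_tree_index i = 0;
  do {
    i = tree[i + *bits++];
  } while (i > 0);
  return -i;
}

With vp9_coef_con_tree and the bit sequence 1, 0, 0, for example, the walk descends through HIGH_LOW and CAT_ONE and returns CATEGORY1_TOKEN.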
@@ -101,12 +133,6 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = {
-  -EOB_MODEL_TOKEN, 2,
-  -ZERO_TOKEN, 4,
-  -ONE_TOKEN, -TWO_TOKEN,
-};
-
 // Model obtained from a 2-sided zero-centered distribution derived
 // from a Pareto distribution. The cdf of the distribution is:
 // cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
@@ -121,7 +147,7 @@
 // by averaging :
 // vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] +
 //                              vp9_pareto8_full[l+1][node] ) >> 1;
-const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
   {  3,  86, 128,   6,  86,  23,  88,  29},
   {  6,  86, 128,  11,  87,  42,  91,  52},
   {  9,  86, 129,  17,  88,  61,  94,  76},
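As a rough illustration of the two-sided Pareto model described in the comment above (not libvpx code; pareto_tail_prob and to_prob8 are hypothetical helpers): for the cdf 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)}^beta], the probability that a coefficient magnitude reaches at least t works out to (alpha/(alpha + t))^beta, which can then be mapped onto the 8-bit probabilities used in the table:

#include <math.h>

/* Tail probability P(|X| >= t) of the symmetric Pareto model. */
static double pareto_tail_prob(double alpha, double beta, double t) {
  return pow(alpha / (alpha + t), beta);
}

/* Map a probability onto the 1..255 range used by the table entries above. */
static int to_prob8(double p) {
  const int v = (int)(p * 256.0 + 0.5);
  return v < 1 ? 1 : (v > 255 ? 255 : v);
}

The averaging identity quoted in the comment above then relates neighbouring rows of the table.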
@@ -716,22 +742,22 @@
   }
 };
 
-static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
-  vpx_memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
-             MODEL_NODES * sizeof(vp9_prob));
+static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
+  memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
+         MODEL_NODES * sizeof(vpx_prob));
 }
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
   if (full != model)
-    vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+    memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
-  vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
-  vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
-  vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+  vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
 #define COEF_COUNT_SAT 24
@@ -745,7 +771,7 @@
                              unsigned int count_sat,
                              unsigned int update_factor) {
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-  vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size];
+  vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
   const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
   vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
   unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index 8a10f23..a1746bc 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -12,10 +12,10 @@
 #define VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
 
-#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_enums.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -58,20 +58,43 @@
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]);
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high10[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high10[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high10[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high10[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high10[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high10[16]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high12[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high12[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high12[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high12[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high12[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #define EOB_MODEL_TOKEN 3
-extern const vp9_tree_index vp9_coefmodel_tree[];
 
 typedef struct {
-  const vp9_tree_index *tree;
-  const vp9_prob *prob;
+  const vpx_tree_index *tree;
+  const vpx_prob *prob;
   int len;
   int base_val;
+  const int16_t *cost;
 } vp9_extra_bit;
 
 // indexed by token value
 extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
+extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #define DCT_MAX_VALUE           16384
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_MAX_VALUE_HIGH10    65536
+#define DCT_MAX_VALUE_HIGH12   262144
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
@@ -113,18 +136,6 @@
 void vp9_default_coef_probs(struct VP9Common *cm);
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
-static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    struct macroblockd_plane *const pd = &xd->plane[i];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-    vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
-                   num_4x4_blocks_wide_lookup[plane_bsize]);
-    vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
-                   num_4x4_blocks_high_lookup[plane_bsize]);
-  }
-}
-
 // This is the index in the scan order beyond which all coefficients for
 // 8x8 transform and above are in the top band.
 // This macro is currently unused but may be used by certain implementations
@@ -149,16 +160,24 @@
 #define PIVOT_NODE                  2   // which node is pivot
 
 #define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
-extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
 
-typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
+typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
                                       [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
 
 typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
                                           [COEFF_CONTEXTS]
                                           [UNCONSTRAINED_NODES + 1];
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full);
+
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return (a != 0) + (b != 0);
+}
 
 static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
                                       const ENTROPY_CONTEXT *l) {
@@ -189,18 +208,6 @@
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                                         PLANE_TYPE type, int block_idx) {
-  const MODE_INFO *const mi = xd->mi[0];
-
-  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
-    return &vp9_default_scan_orders[tx_size];
-  } else {
-    const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
-    return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
-  }
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index 5b00b00..670348b 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -13,7 +13,7 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"
 
-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
   {  // above = dc
     { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
     {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
@@ -127,7 +127,7 @@
   }
 };
 
-const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
   { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
   { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
   { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
@@ -140,14 +140,14 @@
   { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
 };
 
-static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
   { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
   { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
   { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
 };
 
-static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
   { 120,   7,  76, 176, 208, 126,  28,  54, 103 },  // y = dc
   {  48,  12, 154, 155, 139,  90,  34, 117, 119 },  // y = v
   {  67,   6,  25, 204, 243, 158,  13,  21,  96 },  // y = h
@@ -160,7 +160,7 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
-const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                      [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 158,  97,  94 },  // a/l both not split
@@ -184,7 +184,7 @@
   {  12,   3,   3 },  // a/l both split
 };
 
-static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 199, 122, 141 },  // a/l both not split
@@ -208,7 +208,7 @@
   {  10,   7,   6 },  // a/l both split
 };
 
-static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
                                               [INTER_MODES - 1] = {
   {2,       173,   34},  // 0 = both zero mv
   {7,       145,   85},  // 1 = one zero mv + one a predicted mv
@@ -220,7 +220,7 @@
 };
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
   -TM_PRED, 4,                      /* 1 = TM_NODE */
   -V_PRED, 6,                       /* 2 = V_NODE */
@@ -232,31 +232,31 @@
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
 
-const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
 
-const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
 
-static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
 };
 
-static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
   239, 183, 119,  96,  41
 };
 
-static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = {
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
   50, 126, 123, 221, 226
 };
 
-static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
@@ -302,11 +302,11 @@
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }
 
-static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = {
+static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
   { 36, 255, },
@@ -314,7 +314,7 @@
   { 149, 144, },
 };
 
-void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
+static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp9_copy(fc->uv_mode_prob, default_if_uv_probs);
   vp9_copy(fc->y_mode_prob, default_if_y_probs);
   vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
@@ -328,66 +328,54 @@
   vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
 
-const vp9_tree_index vp9_switchable_interp_tree
+const vpx_tree_index vp9_switchable_interp_tree
                          [TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
 
-#define COUNT_SAT 20
-#define MAX_UPDATE_FACTOR 128
-
-static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
-  return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree,
-                        const vp9_prob *pre_probs, const unsigned int *counts,
-                        vp9_prob *probs) {
-  vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
-                   probs);
-}
-
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i, j;
-  FRAME_CONTEXT *fc = &cm->fc;
+  FRAME_CONTEXT *fc = cm->fc;
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
   const FRAME_COUNTS *counts = &cm->counts;
 
   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-    fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
-                                         counts->intra_inter[i]);
+    fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+                                                  counts->intra_inter[i]);
   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-    fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
-                                        counts->comp_inter[i]);
+    fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+                                                 counts->comp_inter[i]);
   for (i = 0; i < REF_CONTEXTS; i++)
-    fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
-                                      counts->comp_ref[i]);
+    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+                                               counts->comp_ref[i]);
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
-                                             counts->single_ref[i][j]);
+      fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+    vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
                 counts->inter_mode[i], fc->inter_mode_probs[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
   for (i = 0; i < INTRA_MODES; ++i)
-    adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
-                counts->uv_mode[i], fc->uv_mode_prob[i]);
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+                         counts->uv_mode[i], fc->uv_mode_prob[i]);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
-                counts->partition[i], fc->partition_prob[i]);
+    vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
 
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-      adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
-                  counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+      vpx_tree_merge_probs(vp9_switchable_interp_tree,
+                           pre_fc->switchable_interp_prob[i],
+                           counts->switchable_interp[i],
+                           fc->switchable_interp_prob[i]);
   }
 
   if (cm->tx_mode == TX_MODE_SELECT) {
@@ -399,23 +387,24 @@
     for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
       tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; ++j)
-        fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
-                                             branch_ct_8x8p[j]);
+        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
 
       tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; ++j)
-        fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
-                                               branch_ct_16x16p[j]);
+        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
 
       tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; ++j)
-        fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
-                                               branch_ct_32x32p[j]);
+        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
     }
   }
 
   for (i = 0; i < SKIP_CONTEXTS; ++i)
-    fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
+    fc->skip_probs[i] = mode_mv_merge_probs(
+        pre_fc->skip_probs[i], counts->skip[i]);
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -439,8 +428,12 @@
   int i;
   vp9_clearall_segfeatures(&cm->seg);
   cm->seg.abs_delta = SEGMENT_DELTADATA;
-  if (cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+    memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  if (cm->current_frame_seg_map)
+    memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
   // Reset the mode ref deltas for loop filter
   vp9_zero(lf->last_ref_deltas);
@@ -451,24 +444,24 @@
   lf->last_sharpness_level = -1;
 
   vp9_default_coef_probs(cm);
-  vp9_init_mode_probs(&cm->fc);
+  init_mode_probs(cm->fc);
   vp9_init_mv_probs(cm);
+  cm->fc->initialized = 1;
 
   if (cm->frame_type == KEY_FRAME ||
       cm->error_resilient_mode || cm->reset_frame_context == 3) {
     // Reset all frame contexts.
     for (i = 0; i < FRAME_CONTEXTS; ++i)
-      cm->frame_contexts[i] = cm->fc;
+      cm->frame_contexts[i] = *cm->fc;
   } else if (cm->reset_frame_context == 2) {
     // Reset only the frame context specified in the frame header.
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
   }
 
-  if (frame_is_intra_only(cm))
-    vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
-                                    sizeof(*cm->prev_mip));
-
-  vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  // prev_mip will only be allocated in the encoder.
+  if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+    memset(cm->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
 
   vp9_zero(cm->ref_frame_sign_bias);
 
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index 533757b..0285be1 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -11,50 +11,55 @@
 #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
 #define VP9_COMMON_VP9_ENTROPYMODE_H_
 
-#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define BLOCK_SIZE_GROUPS 4
+
 #define TX_SIZE_CONTEXTS 2
-#define SWITCHABLE_FILTERS 3   // number of switchable filters
-#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
 
 struct VP9Common;
 
 struct tx_probs {
-  vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
+  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
 };
 
 struct tx_counts {
   unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
   unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
   unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  unsigned int tx_totals[TX_SIZES];
 };
 
 typedef struct frame_contexts {
-  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
-  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-  vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+  vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
-  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vp9_prob single_ref_prob[REF_CONTEXTS][2];
-  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][2];
+  vpx_prob comp_ref_prob[REF_CONTEXTS];
   struct tx_probs tx_probs;
-  vp9_prob skip_probs[SKIP_CONTEXTS];
+  vpx_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
+  int initialized;
 } FRAME_CONTEXT;
 
-typedef struct {
+typedef struct FRAME_COUNTS {
   unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
   unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
   unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
@@ -73,21 +78,19 @@
   nmv_context_counts mv;
 } FRAME_COUNTS;
 
-extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern const vp9_tree_index vp9_switchable_interp_tree
+extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+extern const vpx_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
-void vp9_init_mode_probs(FRAME_CONTEXT *fc);
-
 void vp9_adapt_mode_probs(struct VP9Common *cm);
 
 void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
@@ -97,15 +100,6 @@
 void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
-                                               const MODE_INFO *above_mi,
-                                               const MODE_INFO *left_mi,
-                                               int block) {
-  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
-  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
-  return vp9_kf_y_mode_prob[above][left];
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index 5bb0482..3acfe14 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -11,19 +11,16 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymv.h"
 
-#define MV_COUNT_SAT 20
-#define MV_MAX_UPDATE_FACTOR 128
-
 // Integer pel reference mv threshold for use of high-precision 1/8 mv
 #define COMPANDED_MVREF_THRESH 8
 
-const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
 
-const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
   -MV_CLASS_1, 4,
   6, 8,
@@ -36,11 +33,11 @@
   -MV_CLASS_9, -MV_CLASS_10,
 };
 
-const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
 
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
+const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
   -0, 2,
   -1, 4,
   -2, -3
@@ -135,10 +132,6 @@
          (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
 }
 
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
-  return mv_class_base(c) + offset;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
@@ -183,51 +176,43 @@
   }
 }
 
-static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
-  return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                        const unsigned int *counts, vp9_prob *probs) {
-  vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
-                       MV_MAX_UPDATE_FACTOR, probs);
-}
-
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   int i, j;
 
-  nmv_context *fc = &cm->fc.nmvc;
+  nmv_context *fc = &cm->fc->nmvc;
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
-  adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
+  vpx_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+                       fc->joints);
 
   for (i = 0; i < 2; ++i) {
     nmv_component *comp = &fc->comps[i];
     const nmv_component *pre_comp = &pre_fc->comps[i];
     const nmv_component_counts *c = &counts->comps[i];
 
-    comp->sign = adapt_prob(pre_comp->sign, c->sign);
-    adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
-                comp->classes);
-    adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
+    comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+    vpx_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+                         comp->classes);
+    vpx_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+                         comp->class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
+      comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
 
     for (j = 0; j < CLASS0_SIZE; ++j)
-      adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
-                  comp->class0_fp[j]);
+      vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+                           c->class0_fp[j], comp->class0_fp[j]);
 
-    adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+    vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
 
     if (allow_hp) {
-      comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
-      comp->hp = adapt_prob(pre_comp->hp, c->hp);
+      comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+      comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
     }
   }
 }
 
 void vp9_init_mv_probs(VP9_COMMON *cm) {
-  cm->fc.nmvc = default_nmv_context;
+  cm->fc->nmvc = default_nmv_context;
 }
diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h
index e7033e4..8c817bf 100644
--- a/libvpx/vp9/common/vp9_entropymv.h
+++ b/libvpx/vp9/common/vp9_entropymv.h
@@ -13,7 +13,10 @@
 #define VP9_COMMON_VP9_ENTROPYMV_H_
 
 #include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
+
+#include "vpx_dsp/prob.h"
+
+#include "vp9/common/vp9_mv.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -74,24 +77,24 @@
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_joint_tree[];
-extern const vp9_tree_index vp9_mv_class_tree[];
-extern const vp9_tree_index vp9_mv_class0_tree[];
-extern const vp9_tree_index vp9_mv_fp_tree[];
+extern const vpx_tree_index vp9_mv_joint_tree[];
+extern const vpx_tree_index vp9_mv_class_tree[];
+extern const vpx_tree_index vp9_mv_class0_tree[];
+extern const vpx_tree_index vp9_mv_fp_tree[];
 
 typedef struct {
-  vp9_prob sign;
-  vp9_prob classes[MV_CLASSES - 1];
-  vp9_prob class0[CLASS0_SIZE - 1];
-  vp9_prob bits[MV_OFFSET_BITS];
-  vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
-  vp9_prob fp[MV_FP_SIZE - 1];
-  vp9_prob class0_hp;
-  vp9_prob hp;
+  vpx_prob sign;
+  vpx_prob classes[MV_CLASSES - 1];
+  vpx_prob class0[CLASS0_SIZE - 1];
+  vpx_prob bits[MV_OFFSET_BITS];
+  vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+  vpx_prob fp[MV_FP_SIZE - 1];
+  vpx_prob class0_hp;
+  vpx_prob hp;
 } nmv_component;
 
 typedef struct {
-  vp9_prob joints[MV_JOINTS - 1];
+  vpx_prob joints[MV_JOINTS - 1];
   nmv_component comps[2];
 } nmv_context;
 
@@ -104,8 +107,6 @@
 }
 
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
-
 
 typedef struct {
   unsigned int sign[2];
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index d776313..d089f23 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -12,6 +12,7 @@
 #define VP9_COMMON_VP9_ENUMS_H_
 
 #include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,29 +41,22 @@
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
-typedef enum BIT_DEPTH {
-  BITS_8,
-  BITS_10,
-  BITS_12
-} BIT_DEPTH;
-
-typedef enum BLOCK_SIZE {
-  BLOCK_4X4,
-  BLOCK_4X8,
-  BLOCK_8X4,
-  BLOCK_8X8,
-  BLOCK_8X16,
-  BLOCK_16X8,
-  BLOCK_16X16,
-  BLOCK_16X32,
-  BLOCK_32X16,
-  BLOCK_32X32,
-  BLOCK_32X64,
-  BLOCK_64X32,
-  BLOCK_64X64,
-  BLOCK_SIZES,
-  BLOCK_INVALID = BLOCK_SIZES
-} BLOCK_SIZE;
+#define BLOCK_4X4     0
+#define BLOCK_4X8     1
+#define BLOCK_8X4     2
+#define BLOCK_8X8     3
+#define BLOCK_8X16    4
+#define BLOCK_16X8    5
+#define BLOCK_16X16   6
+#define BLOCK_16X32   7
+#define BLOCK_32X16   8
+#define BLOCK_32X32   9
+#define BLOCK_32X64  10
+#define BLOCK_64X32  11
+#define BLOCK_64X64  12
+#define BLOCK_SIZES  13
+#define BLOCK_INVALID BLOCK_SIZES
+typedef uint8_t BLOCK_SIZE;
 
 typedef enum PARTITION_TYPE {
   PARTITION_NONE,
@@ -73,17 +67,17 @@
   PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;
 
+typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
 #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
 
 // block transform size
-typedef enum {
-  TX_4X4 = 0,                      // 4x4 transform
-  TX_8X8 = 1,                      // 8x8 transform
-  TX_16X16 = 2,                    // 16x16 transform
-  TX_32X32 = 3,                    // 32x32 transform
-  TX_SIZES
-} TX_SIZE;
+typedef uint8_t TX_SIZE;
+#define TX_4X4   ((TX_SIZE)0)   // 4x4 transform
+#define TX_8X8   ((TX_SIZE)1)   // 8x8 transform
+#define TX_16X16 ((TX_SIZE)2)   // 16x16 transform
+#define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
+#define TX_SIZES ((TX_SIZE)4)
 
 // frame transform mode
 typedef enum {
@@ -104,22 +98,48 @@
 } TX_TYPE;
 
 typedef enum {
-  UNKNOWN    = 0,
-  BT_601     = 1,  // YUV
-  BT_709     = 2,  // YUV
-  SMPTE_170  = 3,  // YUV
-  SMPTE_240  = 4,  // YUV
-  RESERVED_1 = 5,
-  RESERVED_2 = 6,
-  SRGB       = 7   // RGB
-} COLOR_SPACE;
-
-typedef enum {
   VP9_LAST_FLAG = 1 << 0,
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
 } VP9_REFFRAME;
 
+typedef enum {
+  PLANE_TYPE_Y  = 0,
+  PLANE_TYPE_UV = 1,
+  PLANE_TYPES
+} PLANE_TYPE;
+
+#define DC_PRED    0       // Average of above and left pixels
+#define V_PRED     1       // Vertical
+#define H_PRED     2       // Horizontal
+#define D45_PRED   3       // Directional 45  deg = round(arctan(1/1) * 180/pi)
+#define D135_PRED  4       // Directional 135 deg = 180 - 45
+#define D117_PRED  5       // Directional 117 deg = 180 - 63
+#define D153_PRED  6       // Directional 153 deg = 180 - 27
+#define D207_PRED  7       // Directional 207 deg = 180 + 27
+#define D63_PRED   8       // Directional 63  deg = round(arctan(2/1) * 180/pi)
+#define TM_PRED    9       // True-motion
+#define NEARESTMV 10
+#define NEARMV    11
+#define ZEROMV    12
+#define NEWMV     13
+#define MB_MODE_COUNT 14
+typedef uint8_t PREDICTION_MODE;
+
+#define INTRA_MODES (TM_PRED + 1)
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c
index afcdf22..4b2198f 100644
--- a/libvpx/vp9/common/vp9_filter.c
+++ b/libvpx/vp9/common/vp9_filter.c
@@ -12,7 +12,8 @@
 
 #include "vp9/common/vp9_filter.h"
 
-const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -32,8 +33,8 @@
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -53,8 +54,8 @@
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -74,8 +75,8 @@
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
@@ -95,15 +96,9 @@
 };
 
 
-static const InterpKernel* vp9_filter_kernels[4] = {
-  vp9_sub_pel_filters_8,
-  vp9_sub_pel_filters_8lp,
-  vp9_sub_pel_filters_8s,
-  vp9_bilinear_filters
+const InterpKernel *vp9_filter_kernels[4] = {
+  sub_pel_filters_8,
+  sub_pel_filters_8lp,
+  sub_pel_filters_8s,
+  bilinear_filters
 };
-
-const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) {
-  assert(filter != SWITCHABLE);
-  return vp9_filter_kernels[filter];
-}
-
diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h
index 8c359c7..efa24bc 100644
--- a/libvpx/vp9/common/vp9_filter.h
+++ b/libvpx/vp9/common/vp9_filter.h
@@ -13,6 +13,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 
@@ -20,32 +21,19 @@
 extern "C" {
 #endif
 
-#define FILTER_BITS 7
+#define EIGHTTAP            0
+#define EIGHTTAP_SMOOTH     1
+#define EIGHTTAP_SHARP      2
+#define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
+#define BILINEAR            3
+// The codec can operate in four possible inter prediction filter modes:
+// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#define SWITCHABLE 4 /* should be the last one */
 
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
+typedef uint8_t INTERP_FILTER;
 
-typedef enum {
-  EIGHTTAP = 0,
-  EIGHTTAP_SMOOTH = 1,
-  EIGHTTAP_SHARP = 2,
-  BILINEAR = 3,
-  SWITCHABLE = 4  /* should be the last one */
-} INTERP_FILTER;
-
-typedef int16_t InterpKernel[SUBPEL_TAPS];
-
-const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
-
-DECLARE_ALIGNED(256, extern const InterpKernel,
-                vp9_bilinear_filters[SUBPEL_SHIFTS]);
-
-// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
-// filter kernel as a 2 tap filter.
-#define BILINEAR_FILTERS_2TAP(x) \
-  (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1)
+extern const InterpKernel *vp9_filter_kernels[4];
 
 #ifdef __cplusplus
 }  // extern "C"
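For context on the new export above: vp9_filter_kernels is indexed first by INTERP_FILTER (EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, BILINEAR) and then by the 1/16-pel phase, each entry being an 8-tap InterpKernel. The stand-alone sketch below is not libvpx code: it keeps the per-phase lookup and the FILTER_BITS rounding, but substitutes a hypothetical 2-tap table for the 8-tap kernels so it compiles on its own.

#include <stdint.h>
#include <stdio.h>

#define SUBPEL_SHIFTS 16
#define FILTER_BITS 7

/* Hypothetical 2-tap kernels, one per 1/16-pel phase, summing to 128 like
 * the non-zero taps of the bilinear_filters entries shown in the diff. */
static const int16_t kernels[SUBPEL_SHIFTS][2] = {
  {128, 0}, {120, 8}, {112, 16}, {104, 24}, {96, 32}, {88, 40},
  {80, 48}, {72, 56}, {64, 64}, {56, 72}, {48, 80}, {40, 88},
  {32, 96}, {24, 104}, {16, 112}, {8, 120}
};

/* Interpolate between src[0] and src[1] at the given sub-pixel phase. */
static uint8_t interp_pixel(const uint8_t *src, int phase) {
  const int16_t *k = kernels[phase];
  const int sum = k[0] * src[0] + k[1] * src[1];
  return (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}

int main(void) {
  const uint8_t row[2] = { 10, 50 };
  printf("%d\n", interp_pixel(row, 8));  /* half-pel phase prints 30 */
  return 0;
}

In the library itself the corresponding access would be vp9_filter_kernels[filter][subpel], with all eight taps applied around the source pixel.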
diff --git a/libvpx/vp9/common/vp9_frame_buffers.c b/libvpx/vp9/common/vp9_frame_buffers.c
index 733b3a9..0f41d66 100644
--- a/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/libvpx/vp9/common/vp9_frame_buffers.c
@@ -61,6 +61,10 @@
     if (!int_fb_list->int_fb[i].data)
       return -1;
 
+    // This memset is needed to fix a valgrind error from the C loop filter
+    // caused by accessing uninitialized memory in the frame border. It could
+    // be removed if the border is removed entirely.
+    memset(int_fb_list->int_fb[i].data, 0, min_size);
     int_fb_list->int_fb[i].size = min_size;
   }
 
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index 856d41e..d12cd76 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -8,291 +8,28 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
 #include <math.h>
 
-#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
 
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
-/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-   0.5 shifts per pixel. */
-  int i;
-  int16_t output[16];
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = a1;
-    op[1] = b1;
-    op[2] = c1;
-    op[3] = d1;
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
-    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
-    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
-    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
-
-    ip++;
-    dest++;
-  }
-}
-
-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
-  int i;
-  int a1, e1;
-  int16_t tmp[4];
-  const int16_t *ip = in;
-  int16_t *op = tmp;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = a1;
-  op[1] = op[2] = op[3] = e1;
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
-    ip++;
-    dest++;
-  }
-}
-
-static void idct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = dct_const_round_shift(temp1);
-  step[1] = dct_const_round_shift(temp2);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = dct_const_round_shift(temp1);
-  step[3] = dct_const_round_shift(temp2);
-
-  // stage 2
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = step[1] - step[2];
-  output[3] = step[0] - step[3];
-}
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    idct4(input, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    idct4(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
-  int i;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel(dest[0] + a1);
-    dest[1] = clip_pixel(dest[1] + a1);
-    dest[2] = clip_pixel(dest[2] + a1);
-    dest[3] = clip_pixel(dest[3] + a1);
-    dest += dest_stride;
-  }
-}
-
-static void idct8(const int16_t *input, int16_t *output) {
-  int16_t step1[8], step2[8];
-  int temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  // stage 2 & stage 3 - even half
-  idct4(step1, step1);
-
-  // stage 2 - odd half
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  // stage 3 -odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = step1[0] + step1[7];
-  output[1] = step1[1] + step1[6];
-  output[2] = step1[2] + step1[5];
-  output[3] = step1[3] + step1[4];
-  output[4] = step1[3] - step1[4];
-  output[5] = step1[2] - step1[5];
-  output[6] = step1[1] - step1[6];
-  output[7] = step1[0] - step1[7];
-}
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    idct8(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
-    dest += stride;
-  }
-}
-
-static void iadst4(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  int x0 = input[0];
-  int x1 = input[1];
-  int x2 = input[2];
-  int x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
-    { idct4, idct4  },  // DCT_DCT  = 0
-    { iadst4, idct4  },   // ADST_DCT = 1
-    { idct4, iadst4 },    // DCT_ADST = 2
-    { iadst4, iadst4 }      // ADST_ADST = 3
+    { idct4_c, idct4_c  },  // DCT_DCT  = 0
+    { iadst4_c, idct4_c  },   // ADST_DCT = 1
+    { idct4_c, iadst4_c },    // DCT_ADST = 2
+    { iadst4_c, iadst4_c }      // ADST_ADST = 3
   };
 
   int i, j;
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  int16_t temp_in[4], temp_out[4];
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
@@ -306,101 +43,26 @@
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
   }
 }
-static void iadst8(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = dct_const_round_shift(s0 + s4);
-  x1 = dct_const_round_shift(s1 + s5);
-  x2 = dct_const_round_shift(s2 + s6);
-  x3 = dct_const_round_shift(s3 + s7);
-  x4 = dct_const_round_shift(s0 - s4);
-  x5 = dct_const_round_shift(s1 - s5);
-  x6 = dct_const_round_shift(s2 - s6);
-  x7 = dct_const_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-
-  output[0] =  x0;
-  output[1] = -x4;
-  output[2] =  x6;
-  output[3] = -x2;
-  output[4] =  x3;
-  output[5] = -x7;
-  output[6] =  x5;
-  output[7] = -x1;
-}
 
 static const transform_2d IHT_8[] = {
-  { idct8,  idct8  },  // DCT_DCT  = 0
-  { iadst8, idct8  },  // ADST_DCT = 1
-  { idct8,  iadst8 },  // DCT_ADST = 2
-  { iadst8, iadst8 }   // ADST_ADST = 3
+  { idct8_c,  idct8_c  },  // DCT_DCT  = 0
+  { iadst8_c, idct8_c  },  // ADST_DCT = 1
+  { idct8_c,  iadst8_c },  // DCT_ADST = 2
+  { iadst8_c, iadst8_c }   // ADST_ADST = 3
 };
 
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   int i, j;
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
   // inverse transform row vectors
@@ -415,410 +77,26 @@
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
   }
 }
 
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8] = { 0 };
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // only first 4 row has non-zero coefs
-  for (i = 0; i < 4; ++i) {
-    idct8(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * stride + i]);
-  }
-}
-
-static void idct16(const int16_t *input, int16_t *output) {
-  int16_t step1[16], step2[16];
-  int temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0/2];
-  step1[1] = input[16/2];
-  step1[2] = input[8/2];
-  step1[3] = input[24/2];
-  step1[4] = input[4/2];
-  step1[5] = input[20/2];
-  step1[6] = input[12/2];
-  step1[7] = input[28/2];
-  step1[8] = input[2/2];
-  step1[9] = input[18/2];
-  step1[10] = input[10/2];
-  step1[11] = input[26/2];
-  step1[12] = input[6/2];
-  step1[13] = input[22/2];
-  step1[14] = input[14/2];
-  step1[15] = input[30/2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
-
-  // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = step2[0] + step2[15];
-  output[1] = step2[1] + step2[14];
-  output[2] = step2[2] + step2[13];
-  output[3] = step2[3] + step2[12];
-  output[4] = step2[4] + step2[11];
-  output[5] = step2[5] + step2[10];
-  output[6] = step2[6] + step2[9];
-  output[7] = step2[7] + step2[8];
-  output[8] = step2[7] - step2[8];
-  output[9] = step2[6] - step2[9];
-  output[10] = step2[5] - step2[10];
-  output[11] = step2[4] - step2[11];
-  output[12] = step2[3] - step2[12];
-  output[13] = step2[2] - step2[13];
-  output[14] = step2[1] - step2[14];
-  output[15] = step2[0] - step2[15];
-}
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    idct16(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
-  }
-}
-
-static void iadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = output[8]
-              = output[9] = output[10] = output[11] = output[12]
-              = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-
-  output[0] =  x0;
-  output[1] = -x8;
-  output[2] =  x12;
-  output[3] = -x4;
-  output[4] =  x6;
-  output[5] =  x14;
-  output[6] =  x10;
-  output[7] =  x2;
-  output[8] =  x3;
-  output[9] =  x11;
-  output[10] =  x15;
-  output[11] =  x7;
-  output[12] =  x5;
-  output[13] = -x13;
-  output[14] =  x9;
-  output[15] = -x1;
-}
-
 static const transform_2d IHT_16[] = {
-  { idct16,  idct16  },  // DCT_DCT  = 0
-  { iadst16, idct16  },  // ADST_DCT = 1
-  { idct16,  iadst16 },  // DCT_ADST = 2
-  { iadst16, iadst16 }   // ADST_ADST = 3
+  { idct16_c,  idct16_c  },  // DCT_DCT  = 0
+  { iadst16_c, idct16_c  },  // ADST_DCT = 1
+  { idct16_c,  iadst16_c },  // DCT_ADST = 2
+  { iadst16_c, iadst16_c }   // ADST_ADST = 3
 };
 
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   int i, j;
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
   // Rows
@@ -833,511 +111,33 @@
     for (j = 0; j < 16; ++j)
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                        + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16] = { 0 };
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    idct16(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j*16 + i];
-    idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
-    dest += stride;
-  }
-}
-
-static void idct32(const int16_t *input, int16_t *output) {
-  int16_t step1[32], step2[32];
-  int temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = dct_const_round_shift(temp1);
-  step1[31] = dct_const_round_shift(temp2);
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = dct_const_round_shift(temp1);
-  step2[15] = dct_const_round_shift(temp2);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-
-  step2[16] = step1[16] + step1[17];
-  step2[17] = step1[16] - step1[17];
-  step2[18] = -step1[18] + step1[19];
-  step2[19] = step1[18] + step1[19];
-  step2[20] = step1[20] + step1[21];
-  step2[21] = step1[20] - step1[21];
-  step2[22] = -step1[22] + step1[23];
-  step2[23] = step1[22] + step1[23];
-  step2[24] = step1[24] + step1[25];
-  step2[25] = step1[24] - step1[25];
-  step2[26] = -step1[26] + step1[27];
-  step2[27] = step1[26] + step1[27];
-  step2[28] = step1[28] + step1[29];
-  step2[29] = step1[28] - step1[29];
-  step2[30] = -step1[30] + step1[31];
-  step2[31] = step1[30] + step1[31];
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = dct_const_round_shift(temp1);
-  step1[7] = dct_const_round_shift(temp2);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-
-  step1[8] = step2[8] + step2[9];
-  step1[9] = step2[8] - step2[9];
-  step1[10] = -step2[10] + step2[11];
-  step1[11] = step2[10] + step2[11];
-  step1[12] = step2[12] + step2[13];
-  step1[13] = step2[12] - step2[13];
-  step1[14] = -step2[14] + step2[15];
-  step1[15] = step2[14] + step2[15];
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = dct_const_round_shift(temp1);
-  step1[30] = dct_const_round_shift(temp2);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = dct_const_round_shift(temp1);
-  step2[1] = dct_const_round_shift(temp2);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
-  step2[4] = step1[4] + step1[5];
-  step2[5] = step1[4] - step1[5];
-  step2[6] = -step1[6] + step1[7];
-  step2[7] = step1[6] + step1[7];
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = dct_const_round_shift(temp1);
-  step2[14] = dct_const_round_shift(temp2);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = step1[16] + step1[19];
-  step2[17] = step1[17] + step1[18];
-  step2[18] = step1[17] - step1[18];
-  step2[19] = step1[16] - step1[19];
-  step2[20] = -step1[20] + step1[23];
-  step2[21] = -step1[21] + step1[22];
-  step2[22] = step1[21] + step1[22];
-  step2[23] = step1[20] + step1[23];
-
-  step2[24] = step1[24] + step1[27];
-  step2[25] = step1[25] + step1[26];
-  step2[26] = step1[25] - step1[26];
-  step2[27] = step1[24] - step1[27];
-  step2[28] = -step1[28] + step1[31];
-  step2[29] = -step1[29] + step1[30];
-  step2[30] = step1[29] + step1[30];
-  step2[31] = step1[28] + step1[31];
-
-  // stage 5
-  step1[0] = step2[0] + step2[3];
-  step1[1] = step2[1] + step2[2];
-  step1[2] = step2[1] - step2[2];
-  step1[3] = step2[0] - step2[3];
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = dct_const_round_shift(temp1);
-  step1[6] = dct_const_round_shift(temp2);
-  step1[7] = step2[7];
-
-  step1[8] = step2[8] + step2[11];
-  step1[9] = step2[9] + step2[10];
-  step1[10] = step2[9] - step2[10];
-  step1[11] = step2[8] - step2[11];
-  step1[12] = -step2[12] + step2[15];
-  step1[13] = -step2[13] + step2[14];
-  step1[14] = step2[13] + step2[14];
-  step1[15] = step2[12] + step2[15];
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = dct_const_round_shift(temp1);
-  step1[29] = dct_const_round_shift(temp2);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = dct_const_round_shift(temp1);
-  step1[28] = dct_const_round_shift(temp2);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = step1[0] + step1[7];
-  step2[1] = step1[1] + step1[6];
-  step2[2] = step1[2] + step1[5];
-  step2[3] = step1[3] + step1[4];
-  step2[4] = step1[3] - step1[4];
-  step2[5] = step1[2] - step1[5];
-  step2[6] = step1[1] - step1[6];
-  step2[7] = step1[0] - step1[7];
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = dct_const_round_shift(temp1);
-  step2[13] = dct_const_round_shift(temp2);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = dct_const_round_shift(temp1);
-  step2[12] = dct_const_round_shift(temp2);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = step1[16] + step1[23];
-  step2[17] = step1[17] + step1[22];
-  step2[18] = step1[18] + step1[21];
-  step2[19] = step1[19] + step1[20];
-  step2[20] = step1[19] - step1[20];
-  step2[21] = step1[18] - step1[21];
-  step2[22] = step1[17] - step1[22];
-  step2[23] = step1[16] - step1[23];
-
-  step2[24] = -step1[24] + step1[31];
-  step2[25] = -step1[25] + step1[30];
-  step2[26] = -step1[26] + step1[29];
-  step2[27] = -step1[27] + step1[28];
-  step2[28] = step1[27] + step1[28];
-  step2[29] = step1[26] + step1[29];
-  step2[30] = step1[25] + step1[30];
-  step2[31] = step1[24] + step1[31];
-
-  // stage 7
-  step1[0] = step2[0] + step2[15];
-  step1[1] = step2[1] + step2[14];
-  step1[2] = step2[2] + step2[13];
-  step1[3] = step2[3] + step2[12];
-  step1[4] = step2[4] + step2[11];
-  step1[5] = step2[5] + step2[10];
-  step1[6] = step2[6] + step2[9];
-  step1[7] = step2[7] + step2[8];
-  step1[8] = step2[7] - step2[8];
-  step1[9] = step2[6] - step2[9];
-  step1[10] = step2[5] - step2[10];
-  step1[11] = step2[4] - step2[11];
-  step1[12] = step2[3] - step2[12];
-  step1[13] = step2[2] - step2[13];
-  step1[14] = step2[1] - step2[14];
-  step1[15] = step2[0] - step2[15];
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = dct_const_round_shift(temp1);
-  step1[27] = dct_const_round_shift(temp2);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = dct_const_round_shift(temp1);
-  step1[26] = dct_const_round_shift(temp2);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = dct_const_round_shift(temp1);
-  step1[25] = dct_const_round_shift(temp2);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = dct_const_round_shift(temp1);
-  step1[24] = dct_const_round_shift(temp2);
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = step1[0] + step1[31];
-  output[1] = step1[1] + step1[30];
-  output[2] = step1[2] + step1[29];
-  output[3] = step1[3] + step1[28];
-  output[4] = step1[4] + step1[27];
-  output[5] = step1[5] + step1[26];
-  output[6] = step1[6] + step1[25];
-  output[7] = step1[7] + step1[24];
-  output[8] = step1[8] + step1[23];
-  output[9] = step1[9] + step1[22];
-  output[10] = step1[10] + step1[21];
-  output[11] = step1[11] + step1[20];
-  output[12] = step1[12] + step1[19];
-  output[13] = step1[13] + step1[18];
-  output[14] = step1[14] + step1[17];
-  output[15] = step1[15] + step1[16];
-  output[16] = step1[15] - step1[16];
-  output[17] = step1[14] - step1[17];
-  output[18] = step1[13] - step1[18];
-  output[19] = step1[12] - step1[19];
-  output[20] = step1[11] - step1[20];
-  output[21] = step1[10] - step1[21];
-  output[22] = step1[9] - step1[22];
-  output[23] = step1[8] - step1[23];
-  output[24] = step1[7] - step1[24];
-  output[25] = step1[6] - step1[25];
-  output[26] = step1[5] - step1[26];
-  output[27] = step1[4] - step1[27];
-  output[28] = step1[3] - step1[28];
-  output[29] = step1[2] - step1[29];
-  output[30] = step1[1] - step1[30];
-  output[31] = step1[0] - step1[31];
-}
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32];
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j)
-      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      idct32(input, outptr);
-    else
-      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                        + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32] = {0};
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    idct32(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
-  }
-}
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  int a1;
-
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i)
-      dest[i] = clip_pixel(dest[i] + a1);
-    dest += stride;
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
   }
 }
 
 // idct
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   if (eob > 1)
-    vp9_idct4x4_16_add(input, dest, stride);
+    vpx_idct4x4_16_add(input, dest, stride);
   else
-    vp9_idct4x4_1_add(input, dest, stride);
+    vpx_idct4x4_1_add(input, dest, stride);
 }
 
 
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   if (eob > 1)
-    vp9_iwht4x4_16_add(input, dest, stride);
+    vpx_iwht4x4_16_add(input, dest, stride);
   else
-    vp9_iwht4x4_1_add(input, dest, stride);
+    vpx_iwht4x4_1_add(input, dest, stride);
 }
 
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
 
@@ -1347,39 +147,39 @@
   // Combine that with code here.
   if (eob == 1)
     // DC only DCT coefficient
-    vp9_idct8x8_1_add(input, dest, stride);
+    vpx_idct8x8_1_add(input, dest, stride);
   else if (eob <= 12)
-    vp9_idct8x8_12_add(input, dest, stride);
+    vpx_idct8x8_12_add(input, dest, stride);
   else
-    vp9_idct8x8_64_add(input, dest, stride);
+    vpx_idct8x8_64_add(input, dest, stride);
 }
 
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob == 1)
     /* DC only DCT coefficient. */
-    vp9_idct16x16_1_add(input, dest, stride);
+    vpx_idct16x16_1_add(input, dest, stride);
   else if (eob <= 10)
-    vp9_idct16x16_10_add(input, dest, stride);
+    vpx_idct16x16_10_add(input, dest, stride);
   else
-    vp9_idct16x16_256_add(input, dest, stride);
+    vpx_idct16x16_256_add(input, dest, stride);
 }
 
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   if (eob == 1)
-    vp9_idct32x32_1_add(input, dest, stride);
+    vpx_idct32x32_1_add(input, dest, stride);
   else if (eob <= 34)
     // non-zero coeff only in upper-left 8x8
-    vp9_idct32x32_34_add(input, dest, stride);
+    vpx_idct32x32_34_add(input, dest, stride);
   else
-    vp9_idct32x32_1024_add(input, dest, stride);
+    vpx_idct32x32_1024_add(input, dest, stride);
 }
 
 // iht
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type == DCT_DCT)
     vp9_idct4x4_add(input, dest, stride, eob);
@@ -1387,7 +187,7 @@
     vp9_iht4x4_16_add(input, dest, stride, tx_type);
 }
 
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct8x8_add(input, dest, stride, eob);
@@ -1396,7 +196,7 @@
   }
 }
 
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct16x16_add(input, dest, stride, eob);
@@ -1404,3 +204,199 @@
     vp9_iht16x16_256_add(input, dest, stride, tx_type);
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  const highbd_transform_2d IHT_4[] = {
+    { vpx_highbd_idct4_c, vpx_highbd_idct4_c  },    // DCT_DCT  = 0
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },    // ADST_DCT = 1
+    { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },    // DCT_ADST = 2
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }    // ADST_ADST = 3
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr, bd);
+    input  += 4;
+    outptr += 4;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+static const highbd_transform_2d HIGH_IHT_8[] = {
+  { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT  = 0
+  { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT = 1
+  { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST = 2
+  { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }   // ADST_ADST = 3
+};
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+static const highbd_transform_2d HIGH_IHT_16[] = {
+  { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT  = 0
+  { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT = 1
+  { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST = 2
+  { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }   // ADST_ADST = 3
+};
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+// idct
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
+  else
+    vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
+}
+
+
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value, do not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  // DC only DCT coefficient
+  if (eob == 1) {
+    vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to separate different cases.
+  // DC only DCT coefficient.
+  if (eob == 1) {
+    vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  // Non-zero coeff only in upper-left 8x8
+  if (eob == 1) {
+    vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {
+    vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
+  }
+}
+
+// iht
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT)
+    vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
+  else
+    vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+  }
+}
+
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h
index 7f595e1..b5a3fbf 100644
--- a/libvpx/vp9/common/vp9_idct.h
+++ b/libvpx/vp9/common/vp9_idct.h
@@ -14,109 +14,66 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
-// Constants and Macros used by all idct/dct functions
-#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
-
-#define UNIT_QUANT_SHIFT 2
-#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
-
-#define pair_set_epi16(a, b) \
-  _mm_set_epi16(b, a, b, a, b, a, b, a)
-
-#define dual_set_epi16(a, b) \
-  _mm_set_epi16(b, b, b, b, a, a, a, a)
-
-// Constants:
-//  for (int i = 1; i< 32; ++i)
-//    printf("static const int cospi_%d_64 = %.0f;\n", i,
-//           round(16384 * cos(i*M_PI/64)));
-// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const int cospi_1_64  = 16364;
-static const int cospi_2_64  = 16305;
-static const int cospi_3_64  = 16207;
-static const int cospi_4_64  = 16069;
-static const int cospi_5_64  = 15893;
-static const int cospi_6_64  = 15679;
-static const int cospi_7_64  = 15426;
-static const int cospi_8_64  = 15137;
-static const int cospi_9_64  = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
-
-//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const int sinpi_1_9 = 5283;
-static const int sinpi_2_9 = 9929;
-static const int sinpi_3_9 = 13377;
-static const int sinpi_4_9 = 15212;
-
-static INLINE int dct_const_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid VP9 input streams, intermediate stage coefficients should always
-  // stay within the range of a signed 16 bit integer. Coefficients can go out
-  // of this range for invalid/corrupt VP9 streams. However, strictly checking
-  // this range for every intermediate coefficient can burdensome for a decoder,
-  // therefore the following assertion is only enabled when configured with
-  // --enable-coefficient-range-checking.
-  assert(INT16_MIN <= rv);
-  assert(rv <= INT16_MAX);
-#endif
-  return (int16_t)rv;
-}
-
-typedef void (*transform_1d)(const int16_t*, int16_t*);
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
 
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
-                       eob);
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+typedef struct {
+  highbd_transform_1d cols, rows;  // vertical and horizontal
+} highbd_transform_2d;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob);
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob);
 
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob);
 
-
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                             uint8_t *dest, int stride, int eob, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 3b39d42..0915918 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -9,10 +9,12 @@
  */
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
@@ -34,10 +36,10 @@
 //
 // A loopfilter should be applied to every other 8x8 horizontally.
 static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
-  0xffffffffffffffff,  // TX_4X4
-  0xffffffffffffffff,  // TX_8x8
-  0x5555555555555555,  // TX_16x16
-  0x1111111111111111,  // TX_32x32
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x5555555555555555ULL,  // TX_16x16
+  0x1111111111111111ULL,  // TX_32x32
 };
 
 // 64 bit masks for above transform size. Each 1 represents a position where
@@ -58,10 +60,10 @@
 //
 // A loopfilter should be applied to every other 4 the row vertically.
 static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
-  0xffffffffffffffff,  // TX_4X4
-  0xffffffffffffffff,  // TX_8x8
-  0x00ff00ff00ff00ff,  // TX_16x16
-  0x000000ff000000ff,  // TX_32x32
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x00ff00ff00ff00ffULL,  // TX_16x16
+  0x000000ff000000ffULL,  // TX_32x32
 };
 
 // 64 bit masks for prediction sizes (left). Each 1 represents a position
@@ -80,59 +82,59 @@
 //  00000000
 //  00000000
 static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4,
-  0x0000000000000001,  // BLOCK_4X8,
-  0x0000000000000001,  // BLOCK_8X4,
-  0x0000000000000001,  // BLOCK_8X8,
-  0x0000000000000101,  // BLOCK_8X16,
-  0x0000000000000001,  // BLOCK_16X8,
-  0x0000000000000101,  // BLOCK_16X16,
-  0x0000000001010101,  // BLOCK_16X32,
-  0x0000000000000101,  // BLOCK_32X16,
-  0x0000000001010101,  // BLOCK_32X32,
-  0x0101010101010101,  // BLOCK_32X64,
-  0x0000000001010101,  // BLOCK_64X32,
-  0x0101010101010101,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4,
+  0x0000000000000001ULL,  // BLOCK_4X8,
+  0x0000000000000001ULL,  // BLOCK_8X4,
+  0x0000000000000001ULL,  // BLOCK_8X8,
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000001ULL,  // BLOCK_16X8,
+  0x0000000000000101ULL,  // BLOCK_16X16,
+  0x0000000001010101ULL,  // BLOCK_16X32,
+  0x0000000000000101ULL,  // BLOCK_32X16,
+  0x0000000001010101ULL,  // BLOCK_32X32,
+  0x0101010101010101ULL,  // BLOCK_32X64,
+  0x0000000001010101ULL,  // BLOCK_64X32,
+  0x0101010101010101ULL,  // BLOCK_64X64
 };
 
 // 64 bit mask to shift and set for each prediction size.
 static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4
-  0x0000000000000001,  // BLOCK_4X8
-  0x0000000000000001,  // BLOCK_8X4
-  0x0000000000000001,  // BLOCK_8X8
-  0x0000000000000001,  // BLOCK_8X16,
-  0x0000000000000003,  // BLOCK_16X8
-  0x0000000000000003,  // BLOCK_16X16
-  0x0000000000000003,  // BLOCK_16X32,
-  0x000000000000000f,  // BLOCK_32X16,
-  0x000000000000000f,  // BLOCK_32X32,
-  0x000000000000000f,  // BLOCK_32X64,
-  0x00000000000000ff,  // BLOCK_64X32,
-  0x00000000000000ff,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000001ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000003ULL,  // BLOCK_16X16
+  0x0000000000000003ULL,  // BLOCK_16X32,
+  0x000000000000000fULL,  // BLOCK_32X16,
+  0x000000000000000fULL,  // BLOCK_32X32,
+  0x000000000000000fULL,  // BLOCK_32X64,
+  0x00000000000000ffULL,  // BLOCK_64X32,
+  0x00000000000000ffULL,  // BLOCK_64X64
 };
 // 64 bit mask to shift and set for each prediction size. A bit is set for
 // each 8x8 block that would be in the left most block of the given block
 // size in the 64x64 block.
 static const uint64_t size_mask[BLOCK_SIZES] = {
-  0x0000000000000001,  // BLOCK_4X4
-  0x0000000000000001,  // BLOCK_4X8
-  0x0000000000000001,  // BLOCK_8X4
-  0x0000000000000001,  // BLOCK_8X8
-  0x0000000000000101,  // BLOCK_8X16,
-  0x0000000000000003,  // BLOCK_16X8
-  0x0000000000000303,  // BLOCK_16X16
-  0x0000000003030303,  // BLOCK_16X32,
-  0x0000000000000f0f,  // BLOCK_32X16,
-  0x000000000f0f0f0f,  // BLOCK_32X32,
-  0x0f0f0f0f0f0f0f0f,  // BLOCK_32X64,
-  0x00000000ffffffff,  // BLOCK_64X32,
-  0xffffffffffffffff,  // BLOCK_64X64
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000303ULL,  // BLOCK_16X16
+  0x0000000003030303ULL,  // BLOCK_16X32,
+  0x0000000000000f0fULL,  // BLOCK_32X16,
+  0x000000000f0f0f0fULL,  // BLOCK_32X32,
+  0x0f0f0f0f0f0f0f0fULL,  // BLOCK_32X64,
+  0x00000000ffffffffULL,  // BLOCK_64X32,
+  0xffffffffffffffffULL,  // BLOCK_64X64
 };
 
 // These are used for masking the left and above borders.
-static const uint64_t left_border =  0x1111111111111111;
-static const uint64_t above_border = 0x000000ff000000ff;
+static const uint64_t left_border =  0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
 
 // 16 bit masks for uv transform sizes.
 static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
@@ -222,9 +224,9 @@
     if (block_inside_limit < 1)
       block_inside_limit = 1;
 
-    vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
-    vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
-               SIMD_WIDTH);
+    memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+    memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+           SIMD_WIDTH);
   }
 }
 
@@ -245,7 +247,7 @@
 
   // init hev threshold const vectors
   for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
-    vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+    memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
 }
 
 void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -266,8 +268,8 @@
 
   for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
     int lvl_seg = default_filt_lvl;
-    if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
-      const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+    if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
       lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
                       data : default_filt_lvl + data,
                       0, MAX_LOOP_FILTER);
@@ -276,7 +278,7 @@
     if (!lf->mode_ref_delta_enabled) {
       // we could get rid of this if we assume that deltas are set to
       // zero when not in use; encoder always uses deltas
-      vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+      memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
     } else {
       int ref, mode;
       const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
@@ -293,7 +295,7 @@
   }
 }
 
-static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void filter_selectively_vert_row2(int subsampling_factor,
                                          uint8_t *s, int pitch,
                                          unsigned int mask_16x16_l,
                                          unsigned int mask_8x8_l,
@@ -301,9 +303,9 @@
                                          unsigned int mask_4x4_int_l,
                                          const loop_filter_info_n *lfi_n,
                                          const uint8_t *lfl) {
-  const int mask_shift = plane_type ? 4 : 8;
-  const int mask_cutoff = plane_type ? 0xf : 0xff;
-  const int lfl_forward = plane_type ? 4 : 8;
+  const int mask_shift = subsampling_factor ? 4 : 8;
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;
 
   unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
   unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
@@ -325,55 +327,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                    lfi0->hev_thr);
         } else if (mask_16x16_0 & 1) {
-          vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                               lfi0->hev_thr);
         } else {
-          vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
                               lfi1->lim, lfi1->hev_thr);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                              lfi0->hev_thr, 1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
@@ -392,6 +394,107 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
+                                                uint16_t *s, int pitch,
+                                                unsigned int mask_16x16_l,
+                                                unsigned int mask_8x8_l,
+                                                unsigned int mask_4x4_l,
+                                                unsigned int mask_4x4_int_l,
+                                                const loop_filter_info_n *lfi_n,
+                                                const uint8_t *lfl, int bd) {
+  const int mask_shift = subsampling_factor ? 4 : 8;
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;
+
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, bd);
+        } else if (mask_16x16_0 & 1) {
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, bd);
+        } else {
+          vpx_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
+                                     lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_8x8_0 & 1) {
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_int_0 & 1) {
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+    }
+
+    s += 8;
+    lfl += 1;
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
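For readers tracking the mask plumbing: filter_selectively_vert_row2() and the high-bit-depth twin added above pack the edge flags of two 8-pixel rows into one word, the low byte (low nibble for a subsampled plane) holding row 0 and the next byte or nibble row 1, and then walk both lanes one bit per column. A standalone sketch of just that bit-walking pattern, with a hypothetical do_edge() callback standing in for the vpx_lpf_vertical_* kernels:

#include <stdio.h>

/* Hypothetical stand-in for a vertical loop-filter kernel. */
static void do_edge(int row, int col) {
  printf("filter vertical edge at row %d, col %d\n", row, col);
}

/* Walk two rows of per-column edge flags packed into one mask word.
 * subsampled != 0 mimics a chroma plane: 4 columns per row instead of 8. */
static void walk_two_rows(unsigned int mask_l, int subsampled) {
  const int mask_shift = subsampled ? 4 : 8;
  const int mask_cutoff = subsampled ? 0xf : 0xff;
  unsigned int mask_0 = mask_l & mask_cutoff;                  /* row 0 lane */
  unsigned int mask_1 = (mask_l >> mask_shift) & mask_cutoff;  /* row 1 lane */
  int col = 0;
  unsigned int mask;

  for (mask = mask_0 | mask_1; mask; mask >>= 1, ++col) {
    if (mask_0 & 1) do_edge(0, col);
    if (mask_1 & 1) do_edge(1, col);
    mask_0 >>= 1;
    mask_1 >>= 1;
  }
}

int main(void) {
  walk_two_rows(0x0503, 0);  /* row 0: cols 0,1; row 1: cols 0,2 */
  return 0;
}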
 static void filter_selectively_horiz(uint8_t *s, int pitch,
                                      unsigned int mask_16x16,
                                      unsigned int mask_8x8,
@@ -410,72 +513,72 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 2);
           count = 2;
         } else {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
-          // Next block's thresholds
+          // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4_int & 1) {
-        vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr, 1);
       }
     }
@@ -488,6 +591,112 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
+                                            unsigned int mask_16x16,
+                                            unsigned int mask_8x8,
+                                            unsigned int mask_4x4,
+                                            unsigned int mask_4x4_int,
+                                            const loop_filter_info_n *lfi_n,
+                                            const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  int count;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2, bd);
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1, bd);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4_int & 1) {
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1, bd);
+      }
+    }
+    s += 8 * count;
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
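The horizontal walkers above add one more twist: when two adjacent columns carry the same edge size, i.e. (mask & 3) == 3, a *_dual kernel handles both and the loop advances by count = 2 columns instead of 1. A minimal sketch of that pairing logic, again with a hypothetical callback in place of the vpx_lpf_horizontal_* kernels:

#include <stdio.h>

/* Hypothetical stand-in for a horizontal loop-filter kernel;
 * width is 1 for the single variant and 2 for the dual variant. */
static void do_edge(int col, int width) {
  printf("filter %d column(s) starting at col %d\n", width, col);
}

/* Pair up adjacent set bits so a dual kernel can cover both at once. */
static void walk_horizontal(unsigned int mask) {
  int col = 0;
  int count;
  for (; mask; mask >>= count, col += count) {
    count = 1;
    if (mask & 1) {
      if ((mask & 3) == 3) {
        do_edge(col, 2);  /* dual kernel: two 8-pixel columns */
        count = 2;
      } else {
        do_edge(col, 1);  /* single kernel */
      }
    }
  }
}

int main(void) {
  walk_horizontal(0x2d);  /* bits 0,2,3,5 -> single(0), dual(2,3), single(5) */
  return 0;
}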
 // This function ors into the current lfm structure, where to do loop
 // filters for the specific mi we are looking at. It uses information
 // including the block_size_type (32x16, 32x32, etc.), the transform size,
@@ -520,7 +729,7 @@
     const int h = num_8x8_blocks_high_lookup[block_size];
     int index = shift_y;
     for (i = 0; i < h; i++) {
-      vpx_memset(&lfm->lfl_y[index], filter_level, w);
+      memset(&lfm->lfl_y[index], filter_level, w);
       index += 8;
     }
   }
@@ -566,7 +775,7 @@
   // an 8x8 in that the internal ones can be skipped and don't depend on
   // the prediction block size.
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 
   if (tx_size_uv == TX_4X4)
     *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
@@ -594,7 +803,7 @@
     const int h = num_8x8_blocks_high_lookup[block_size];
     int index = shift_y;
     for (i = 0; i < h; i++) {
-      vpx_memset(&lfm->lfl_y[index], filter_level, w);
+      memset(&lfm->lfl_y[index], filter_level, w);
       index += 8;
     }
   }
@@ -612,7 +821,7 @@
               left_64x64_txform_mask[tx_size_y]) << shift_y;
 
   if (tx_size_y == TX_4X4)
-    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 }
 
 // This function sets up the bit masks for the entire 64x64 region represented
@@ -761,7 +970,7 @@
       break;
   }
   // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also also.
+  // for 32x32 transforms also.
   lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
   lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
   lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
@@ -814,7 +1023,7 @@
 
     // Each pixel inside the border gets a 1, the multiply copies the border
     // to where we need it.
-    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101;
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
     const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
 
     // Internal edges are not applied on the last column of the image so
@@ -846,7 +1055,7 @@
   // out.
   if (mi_col == 0) {
     for (i = 0; i < TX_32X32; i++) {
-      lfm->left_y[i] &= 0xfefefefefefefefe;
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;
       lfm->left_uv[i] &= 0xeeee;
     }
   }
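A note on the 64-bit luma masks manipulated here: each bit covers one 8x8 block of a 64x64 superblock, with the low byte holding the top row of eight blocks, the next byte the second row, and so on. That layout is why a column mask such as ((1 << columns) - 1) is replicated to every row by multiplying with 0x0101010101010101, and why disabling filtering across the picture's left border is the per-byte AND with 0xfefefefefefefefe seen just above. A tiny self-contained illustration, assuming that row-major bit ordering:

#include <stdint.h>
#include <stdio.h>

/* Print a 64-bit superblock mask as an 8x8 grid of 8x8-block flags.
 * Bit (8*row + col) is assumed to cover block (row, col). */
static void print_sb_mask(uint64_t mask) {
  int row, col;
  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col)
      putchar(((mask >> (8 * row + col)) & 1) ? '1' : '.');
    putchar('\n');
  }
  putchar('\n');
}

int main(void) {
  const int columns = 5;  /* e.g. a picture that ends 5 blocks into this SB */
  const uint64_t mask_y = ((1ULL << columns) - 1) * 0x0101010101010101ULL;

  print_sb_mask(mask_y);                          /* 5 left columns, all rows */
  print_sb_mask(mask_y & 0xfefefefefefefefeULL);  /* leftmost column cleared */
  return 0;
}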
@@ -885,15 +1094,15 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       } else if (mask_4x4 & 1) {
-        vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -903,14 +1112,53 @@
   }
 }
 
-static void filter_block_plane_non420(VP9_COMMON *cm,
-                                      struct macroblockd_plane *plane,
-                                      MODE_INFO **mi_8x8,
-                                      int mi_row, int mi_col) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
+                                           unsigned int mask_16x16,
+                                           unsigned int mask_8x8,
+                                           unsigned int mask_4x4,
+                                           unsigned int mask_4x4_int,
+                                           const loop_filter_info_n *lfi_n,
+                                           const uint8_t *lfl, int bd) {
+  unsigned int mask;
+
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, bd);
+      } else if (mask_8x8 & 1) {
+        vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1, bd);
+      } else if (mask_4x4 & 1) {
+        vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+      }
+    }
+    if (mask_4x4_int & 1)
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+    s += 8;
+    lfl += 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,
+                                   int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
-  const int row_step = 1 << ss_x;
-  const int col_step = 1 << ss_y;
+  const int row_step = 1 << ss_y;
+  const int col_step = 1 << ss_x;
   const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
@@ -1001,12 +1249,32 @@
 
     // Disable filtering on the leftmost column
     border_mask = ~(mi_col == 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+                                     dst->stride,
+                                     mask_16x16_c & border_mask,
+                                     mask_8x8_c & border_mask,
+                                     mask_4x4_c & border_mask,
+                                     mask_4x4_int[r],
+                                     &cm->lf_info, &lfl[r << 3],
+                                     (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert(dst->buf, dst->stride,
+                              mask_16x16_c & border_mask,
+                              mask_8x8_c & border_mask,
+                              mask_4x4_c & border_mask,
+                              mask_4x4_int[r],
+                              &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_vert(dst->buf, dst->stride,
                             mask_16x16_c & border_mask,
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r],
                             &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
   }
@@ -1030,179 +1298,254 @@
       mask_8x8_r = mask_8x8[r];
       mask_4x4_r = mask_4x4[r];
     }
-
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int_r,
+                                      &cm->lf_info, &lfl[r << 3],
+                                      (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_r,
+                               &cm->lf_info, &lfl[r << 3]);
+    }
+#else
     filter_selectively_horiz(dst->buf, dst->stride,
                              mask_16x16_r,
                              mask_8x8_r,
                              mask_4x4_r,
                              mask_4x4_int_r,
                              &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
 }
 
-void vp9_filter_block_plane(VP9_COMMON *const cm,
-                            struct macroblockd_plane *const plane,
-                            int mi_row,
-                            LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
-  uint8_t* const dst0 = dst->buf;
-  int r, c;
+  uint8_t *const dst0 = dst->buf;
+  int r;
+  uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+  uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+  uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+  uint64_t mask_4x4_int = lfm->int_4x4_y;
 
-  if (!plane->plane_type) {
-    uint64_t mask_16x16 = lfm->left_y[TX_16X16];
-    uint64_t mask_8x8 = lfm->left_y[TX_8X8];
-    uint64_t mask_4x4 = lfm->left_y[TX_4X4];
-    uint64_t mask_4x4_int = lfm->int_4x4_y;
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
-    // Vertical pass: do 2 rows at one time
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-      unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
+  // Vertical pass: process two rows at a time.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
 
-      // Disable filtering on the leftmost column
-      filter_selectively_vert_row2(plane->plane_type,
-                                   dst->buf, dst->stride,
-                                   mask_16x16_l,
-                                   mask_8x8_l,
-                                   mask_4x4_l,
-                                   mask_4x4_int_l,
-                                   &cm->lf_info, &lfm->lfl_y[r << 3]);
+// Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(
+          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+    }
+#else
+    filter_selectively_vert_row2(
+        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 16;
+    mask_8x8 >>= 16;
+    mask_4x4 >>= 16;
+    mask_4x4_int >>= 16;
+  }
 
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 16;
-      mask_8x8 >>= 16;
-      mask_4x4 >>= 16;
-      mask_4x4_int >>= 16;
+  // Horizontal pass
+  dst->buf = dst0;
+  mask_16x16 = lfm->above_y[TX_16X16];
+  mask_8x8 = lfm->above_y[TX_8X8];
+  mask_4x4 = lfm->above_y[TX_4X4];
+  mask_4x4_int = lfm->int_4x4_y;
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16 & 0xff;
+      mask_8x8_r = mask_8x8 & 0xff;
+      mask_4x4_r = mask_4x4 & 0xff;
     }
 
-    // Horizontal pass
-    dst->buf = dst0;
-    mask_16x16 = lfm->above_y[TX_16X16];
-    mask_8x8 = lfm->above_y[TX_8X8];
-    mask_4x4 = lfm->above_y[TX_4X4];
-    mask_4x4_int = lfm->int_4x4_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+          (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+                               &lfm->lfl_y[r << 3]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+                             &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
-      unsigned int mask_16x16_r;
-      unsigned int mask_8x8_r;
-      unsigned int mask_4x4_r;
+    dst->buf += 8 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
+  }
+}
 
-      if (mi_row + r == 0) {
-        mask_16x16_r = 0;
-        mask_8x8_r = 0;
-        mask_4x4_r = 0;
-      } else {
-        mask_16x16_r = mask_16x16 & 0xff;
-        mask_8x8_r = mask_8x8 & 0xff;
-        mask_4x4_r = mask_4x4 & 0xff;
+void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r, c;
+
+  uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+  uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+  uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+  uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+  // Vertical pass: process two rows at a time.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+    if (plane->plane_type == 1) {
+      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
       }
+    }
 
-      filter_selectively_horiz(dst->buf, dst->stride,
-                               mask_16x16_r,
-                               mask_8x8_r,
-                               mask_4x4_r,
-                               mask_4x4_int & 0xff,
-                               &cm->lf_info, &lfm->lfl_y[r << 3]);
+    {
+      unsigned int mask_16x16_l = mask_16x16 & 0xff;
+      unsigned int mask_8x8_l = mask_8x8 & 0xff;
+      unsigned int mask_4x4_l = mask_4x4 & 0xff;
+      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
 
-      dst->buf += 8 * dst->stride;
+// Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        highbd_filter_selectively_vert_row2(
+            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+      } else {
+        filter_selectively_vert_row2(
+            plane->subsampling_x, dst->buf, dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfm->lfl_uv[r << 1]);
+      }
+#else
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride,
+          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      dst->buf += 16 * dst->stride;
       mask_16x16 >>= 8;
       mask_8x8 >>= 8;
       mask_4x4 >>= 8;
       mask_4x4_int >>= 8;
     }
-  } else {
-    uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
-    uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
-    uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
-    uint16_t mask_4x4_int = lfm->int_4x4_uv;
+  }
 
-    // Vertical pass: do 2 rows at one time
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
-      if (plane->plane_type == 1) {
-        for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-          lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-          lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) +
-                                                       (c << 1)];
-        }
-      }
+  // Horizontal pass
+  dst->buf = dst0;
+  mask_16x16 = lfm->above_uv[TX_16X16];
+  mask_8x8 = lfm->above_uv[TX_8X8];
+  mask_4x4 = lfm->above_uv[TX_4X4];
+  mask_4x4_int = lfm->int_4x4_uv;
 
-      {
-        unsigned int mask_16x16_l = mask_16x16 & 0xff;
-        unsigned int mask_8x8_l = mask_8x8 & 0xff;
-        unsigned int mask_4x4_l = mask_4x4 & 0xff;
-        unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r =
+        skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
 
-        // Disable filtering on the leftmost column
-        filter_selectively_vert_row2(plane->plane_type,
-                                     dst->buf, dst->stride,
-                                     mask_16x16_l,
-                                     mask_8x8_l,
-                                     mask_4x4_l,
-                                     mask_4x4_int_l,
-                                     &cm->lf_info, &lfm->lfl_uv[r << 1]);
-
-        dst->buf += 16 * dst->stride;
-        mask_16x16 >>= 8;
-        mask_8x8 >>= 8;
-        mask_4x4 >>= 8;
-        mask_4x4_int >>= 8;
-      }
+    if (mi_row + r == 0) {
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16 & 0xf;
+      mask_8x8_r = mask_8x8 & 0xf;
+      mask_4x4_r = mask_4x4 & 0xf;
     }
 
-    // Horizontal pass
-    dst->buf = dst0;
-    mask_16x16 = lfm->above_uv[TX_16X16];
-    mask_8x8 = lfm->above_uv[TX_8X8];
-    mask_4x4 = lfm->above_uv[TX_4X4];
-    mask_4x4_int = lfm->int_4x4_uv;
-
-    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-      const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
-      const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
-          0 : (mask_4x4_int & 0xf);
-      unsigned int mask_16x16_r;
-      unsigned int mask_8x8_r;
-      unsigned int mask_4x4_r;
-
-      if (mi_row + r == 0) {
-        mask_16x16_r = 0;
-        mask_8x8_r = 0;
-        mask_4x4_r = 0;
-      } else {
-        mask_16x16_r = mask_16x16 & 0xf;
-        mask_8x8_r = mask_8x8 & 0xf;
-        mask_4x4_r = mask_4x4 & 0xf;
-      }
-
-      filter_selectively_horiz(dst->buf, dst->stride,
-                               mask_16x16_r,
-                               mask_8x8_r,
-                               mask_4x4_r,
-                               mask_4x4_int_r,
-                               &cm->lf_info, &lfm->lfl_uv[r << 1]);
-
-      dst->buf += 8 * dst->stride;
-      mask_16x16 >>= 4;
-      mask_8x8 >>= 4;
-      mask_4x4 >>= 4;
-      mask_4x4_int >>= 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               &lfm->lfl_uv[r << 1]);
     }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                             &lfm->lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    dst->buf += 8 * dst->stride;
+    mask_16x16 >>= 4;
+    mask_8x8 >>= 4;
+    mask_4x4 >>= 4;
+    mask_4x4_int >>= 4;
   }
 }
 
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                           VP9_COMMON *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
-                                 planes[1].subsampling_x == 1);
+  enum lf_path path;
   LOOP_FILTER_MASK lfm;
   int mi_row, mi_col;
 
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
 
@@ -1212,16 +1555,23 @@
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      if (use_420)
-        vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                       &lfm);
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     &lfm);
 
-      for (plane = 0; plane < num_planes; ++plane) {
-        if (use_420)
-          vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
-        else
-          filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                    mi_row, mi_col);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+            break;
+        }
       }
     }
   }
@@ -1247,9 +1597,19 @@
                        y_only);
 }
 
-int vp9_loop_filter_worker(void *arg1, void *arg2) {
-  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
-  (void)arg2;
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->start = 0;
+  lf_data->stop = 0;
+  lf_data->y_only = 0;
+  memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+  (void)unused;
   vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                        lf_data->start, lf_data->stop, lf_data->y_only);
   return 1;
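The worker refactor above replaces the void*-based vp9_loop_filter_worker(arg1, arg2) with a typed hook plus a vp9_loop_filter_data_reset() initializer, so the per-job fields (start, stop, y_only) are re-armed in one place and the sync fields drop out of LFWorkerData. A standalone analogue of that pattern, using hypothetical names (lf_job_reset/lf_job_run stand in for the real functions, and no threading library is involved):

#include <stdio.h>
#include <string.h>

/* Hypothetical analogue of LFWorkerData: constant frame state plus a
 * per-job [start, stop) row range that is re-armed before every run. */
typedef struct {
  const char *frame_name;  /* stands in for frame_buffer/cm/planes */
  int start;
  int stop;
  int y_only;
} lf_job;

static void lf_job_reset(lf_job *job, const char *frame_name) {
  memset(job, 0, sizeof(*job));  /* start = stop = y_only = 0 */
  job->frame_name = frame_name;
}

/* Typed worker hook: filters one slice of rows, returns nonzero on success. */
static int lf_job_run(lf_job *const job, void *unused) {
  (void)unused;
  printf("filtering rows [%d, %d) of %s%s\n", job->start, job->stop,
         job->frame_name, job->y_only ? " (luma only)" : "");
  return 1;
}

int main(void) {
  lf_job job;
  lf_job_reset(&job, "frame0");
  job.start = 0;
  job.stop = 32;
  return lf_job_run(&job, NULL) ? 0 : 1;
}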
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index 6fa2773..f7cbde6 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -29,6 +29,12 @@
 #define MAX_REF_LF_DELTAS       4
 #define MAX_MODE_LF_DELTAS      2
 
+enum lf_path {
+  LF_PATH_420,
+  LF_PATH_444,
+  LF_PATH_SLOW,
+};
+
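The lf_path enum added here lets vp9_loop_filter_rows() pick a per-plane routine from the chroma subsampling instead of the old use_420 boolean. A sketch of that selection, mirroring the logic in the vp9_loopfilter.c hunk above (luma-only frames and unsubsampled chroma both take the ss00 path, 4:2:0 takes ss11, anything else falls back to the slow non-420 routine):

#include <stdio.h>

enum lf_path {
  LF_PATH_420,
  LF_PATH_444,
  LF_PATH_SLOW,
};

/* Mirror of the path selection in vp9_loop_filter_rows(): y_only frames and
 * 4:4:4 chroma use the ss00 routine, 4:2:0 uses ss11, anything else (e.g.
 * 4:2:2) takes the generic non-420 path. */
static enum lf_path pick_path(int y_only, int subsampling_x,
                              int subsampling_y) {
  if (y_only)
    return LF_PATH_444;
  if (subsampling_x == 1 && subsampling_y == 1)
    return LF_PATH_420;
  if (subsampling_x == 0 && subsampling_y == 0)
    return LF_PATH_444;
  return LF_PATH_SLOW;
}

int main(void) {
  printf("%d %d %d\n",
         pick_path(0, 1, 1),   /* LF_PATH_420 */
         pick_path(0, 0, 0),   /* LF_PATH_444 */
         pick_path(0, 1, 0));  /* LF_PATH_SLOW (4:2:2) */
  return 0;
}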
 struct loopfilter {
   int filter_level;
 
@@ -92,10 +98,20 @@
                     MODE_INFO **mi_8x8, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm);
 
-void vp9_filter_block_plane(struct VP9Common *const cm,
-                            struct macroblockd_plane *const plane,
-                            int mi_row,
-                            LOOP_FILTER_MASK *lfm);
+void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_ss11(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,
+                                   int mi_row, int mi_col);
 
 void vp9_loop_filter_init(struct VP9Common *cm);
 
@@ -111,26 +127,27 @@
                            int y_only, int partial_frame);
 
 // Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                           struct VP9Common *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only);
 
 typedef struct LoopFilterWorkerData {
-  const YV12_BUFFER_CONFIG *frame_buffer;
+  YV12_BUFFER_CONFIG *frame_buffer;
   struct VP9Common *cm;
   struct macroblockd_plane planes[MAX_MB_PLANE];
 
   int start;
   int stop;
   int y_only;
-
-  struct VP9LfSyncData *lf_sync;
-  int num_lf_workers;
 } LFWorkerData;
 
-// Operates on the rows described by LFWorkerData passed as 'arg1'.
-int vp9_loop_filter_worker(void *arg1, void *arg2);
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
+// Operates on the rows described by 'lf_data'.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c
deleted file mode 100644
index 25d3311..0000000
--- a/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-static INLINE int8_t signed_char_clamp(int t) {
-  return (int8_t)clamp(t, -128, 127);
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
-                                 uint8_t p3, uint8_t p2,
-                                 uint8_t p1, uint8_t p0,
-                                 uint8_t q0, uint8_t q1,
-                                 uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p3 - p2) > limit) * -1;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(q3 - q2) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask4(uint8_t thresh,
-                                uint8_t p3, uint8_t p2,
-                                uint8_t p1, uint8_t p0,
-                                uint8_t q0, uint8_t q1,
-                                uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > thresh) * -1;
-  mask |= (abs(q1 - q0) > thresh) * -1;
-  mask |= (abs(p2 - p0) > thresh) * -1;
-  mask |= (abs(q2 - q0) > thresh) * -1;
-  mask |= (abs(p3 - p0) > thresh) * -1;
-  mask |= (abs(q3 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask5(uint8_t thresh,
-                                uint8_t p4, uint8_t p3,
-                                uint8_t p2, uint8_t p1,
-                                uint8_t p0, uint8_t q0,
-                                uint8_t q1, uint8_t q2,
-                                uint8_t q3, uint8_t q4) {
-  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
-                              uint8_t q0, uint8_t q1) {
-  int8_t hev = 0;
-  hev  |= (abs(p1 - p0) > thresh) * -1;
-  hev  |= (abs(q1 - q0) > thresh) * -1;
-  return hev;
-}
-
-static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
-                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-
-  const int8_t ps1 = (int8_t) *op1 ^ 0x80;
-  const int8_t ps0 = (int8_t) *op0 ^ 0x80;
-  const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
-  const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
-
-  // add outer taps if we have high edge variance
-  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-  // inner taps
-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-
-  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-
-  // outer tap adjustments
-  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
-  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
-}
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh, int count) {
-  int i;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
-  }
-}
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
-  int i;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
-    s += pitch;
-  }
-}
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                  thresh1, 1);
-}
-
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
-                           uint8_t *op3, uint8_t *op2,
-                           uint8_t *op1, uint8_t *op0,
-                           uint8_t *oq0, uint8_t *oq1,
-                           uint8_t *oq2, uint8_t *oq3) {
-  if (flat && mask) {
-    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
-    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
-  } else {
-    filter4(mask, thresh, op1,  op0, oq0, oq1);
-  }
-}
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh,
-                            int count) {
-  int i;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-                                 s,         s + 1 * p, s + 2 * p, s + 3 * p);
-    ++s;
-  }
-}
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
-  int i;
-
-  for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
-                                 s,     s + 1, s + 2, s + 3);
-    s += pitch;
-  }
-}
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                    thresh1, 1);
-}
-
-static INLINE void filter16(int8_t mask, uint8_t thresh,
-                            uint8_t flat, uint8_t flat2,
-                            uint8_t *op7, uint8_t *op6,
-                            uint8_t *op5, uint8_t *op4,
-                            uint8_t *op3, uint8_t *op2,
-                            uint8_t *op1, uint8_t *op0,
-                            uint8_t *oq0, uint8_t *oq1,
-                            uint8_t *oq2, uint8_t *oq3,
-                            uint8_t *oq4, uint8_t *oq5,
-                            uint8_t *oq6, uint8_t *oq7) {
-  if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
-                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
-                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
-
-    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
-    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
-                              q0, 4);
-    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
-                              q0 + q1, 4);
-    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
-                              q0 + q1 + q2, 4);
-    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
-                              q0 + q1 + q2 + q3, 4);
-    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
-                              q0 + q1 + q2 + q3 + q4, 4);
-    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
-                              q0 + q1 + q2 + q3 + q4 + q5, 4);
-    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
-    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
-                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
-    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
-                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
-    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
-                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
-    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
-                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
-                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
-                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
-    *oq6 = ROUND_POWER_OF_TWO(p0 +
-                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int count) {
-  int i;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask5(1,
-                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
-                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
-    filter16(mask, *thresh, flat, flat2,
-             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
-             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-             s,         s + 1 * p, s + 2 * p, s + 3 * p,
-             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
-    ++s;
-  }
-}
-
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
-                                   const uint8_t *blimit,
-                                   const uint8_t *limit,
-                                   const uint8_t *thresh,
-                                   int count) {
-  int i;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
-    const int8_t mask = filter_mask(*limit, *blimit,
-                                    p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
-                                    q0, s[4], s[5], s[6], s[7]);
-
-    filter16(mask, *thresh, flat, flat2,
-             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
-    s += p;
-  }
-}
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
-}
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
-}
diff --git a/libvpx/vp9/common/vp9_mfqe.c b/libvpx/vp9/common/vp9_mfqe.c
new file mode 100644
index 0000000..6d560f4
--- /dev/null
+++ b/libvpx/vp9/common/vp9_mfqe.c
@@ -0,0 +1,394 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// an SSE2 implementation in vp8, so consider how to share it
+// between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             int block_size, int src_weight) {
+  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+  int r, c;
+
+  for (r = 0; r < block_size; r++) {
+    for (c = 0; c < block_size; c++) {
+      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+               >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+                            weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+                            dst + dst_stride * 16, dst_stride, weight);
+  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+                            dst + dst_stride * 16 + 16, dst_stride, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+  filter_by_weight32x32(src + 32, src_stride, dst + 32,
+                        dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32, src_stride,
+                        dst + dst_stride * 32, dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+                        dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+                          int yd_stride, const uint8_t *u, const uint8_t *v,
+                          int uv_stride, uint8_t *ud, uint8_t *vd,
+                          int uvd_stride, BLOCK_SIZE block_size,
+                          int weight) {
+  if (block_size == BLOCK_16X16) {
+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_32X32) {
+    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_64X64) {
+    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+  }
+}
+
+// TODO(jackychen): Determine whether to replace this with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+                        uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 8; r++) {
+    memcpy(dst, src, 8);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 16; r++) {
+    memcpy(dst, src, 16);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem16x16(src, src_stride, dst, dst_stride);
+  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16, src_stride,
+                dst + dst_stride * 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+                dst + dst_stride * 16 + 16, dst_stride);
+}
+
+static void copy_mem64x64(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+                       uint8_t *vd, int yd_stride, int uvd_stride,
+                       BLOCK_SIZE bs) {
+  if (bs == BLOCK_16X16) {
+    copy_mem16x16(y, y_stride, yd, yd_stride);
+    copy_mem8x8(u, uv_stride, ud, uvd_stride);
+    copy_mem8x8(v, uv_stride, vd, uvd_stride);
+  } else if (bs == BLOCK_32X32) {
+    copy_mem32x32(y, y_stride, yd, yd_stride);
+    copy_mem16x16(u, uv_stride, ud, uvd_stride);
+    copy_mem16x16(v, uv_stride, vd, uvd_stride);
+  } else {
+    copy_mem64x64(y, y_stride, yd, yd_stride);
+    copy_mem32x32(u, uv_stride, ud, uvd_stride);
+    copy_mem32x32(v, uv_stride, vd, uvd_stride);
+  }
+}
+
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+  const int adj = qdiff >> MFQE_PRECISION;
+  if (bs == BLOCK_16X16) {
+    *sad_thr = 7 + adj;
+  } else if (bs == BLOCK_32X32) {
+    *sad_thr = 6 + adj;
+  } else {  // BLOCK_64X64
+    *sad_thr = 5 + adj;
+  }
+  *vdiff_thr = 125 + qdiff;
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+                       const uint8_t *v, int y_stride, int uv_stride,
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+                       int uvd_stride, int qdiff) {
+  int sad, sad_thr, vdiff, vdiff_thr;
+  uint32_t sse;
+
+  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
+  if (bs == BLOCK_16X16) {
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+  } else if (bs == BLOCK_32X32) {
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+  } else /* if (bs == BLOCK_64X64) */ {
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+  }
+
+  // vdiff > sad * 3 ensures vdiff is not too small; a small vdiff usually
+  // indicates a lighting change in a smooth area, and applying MFQE to such
+  // a block is dangerous.
+  if (sad > 1 && vdiff > sad * 3) {
+    const int weight = 1 << MFQE_PRECISION;
+    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+    // When ifactor equals weight, no MFQE is done.
+    if (ifactor > weight) {
+      ifactor = weight;
+    }
+    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+                  uvd_stride, bs, ifactor);
+  } else {
+    // Copy the block from current frame (i.e., no mfqe is done).
+    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+               yd_stride, uvd_stride, bs);
+  }
+}
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in the current block (for inter frames), or the motion
+  // in the correlated block in the last frame (for key frames).
+  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+                            mi->mbmi.mv[0].as_mv.row +
+                            mi->mbmi.mv[0].as_mv.col *
+                            mi->mbmi.mv[0].as_mv.col;
+  const int mv_threshold = 100;
+  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
+         cur_bs >= BLOCK_16X16 &&
+         mv_len_square <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+                           const uint8_t *y, const uint8_t *u,
+                           const uint8_t *v, int y_stride, int uv_stride,
+                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                           int yd_stride, int uvd_stride) {
+  int mi_offset, y_offset, uv_offset;
+  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int bsl = b_width_log2_lookup[bs];
+  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+  const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+  if (cur_bs < BLOCK_8X8) {
+    // Blocks smaller than 8x8 can only occur on the frame boundary.
+    return;
+  }
+  // No MFQE on blocks smaller than 16x16
+  if (bs == BLOCK_16X16) {
+    partition = PARTITION_NONE;
+  }
+  if (bs == BLOCK_64X64) {
+    mi_offset = 4;
+    y_offset = 32;
+    uv_offset = 16;
+  } else {
+    mi_offset = 2;
+    y_offset = 16;
+    uv_offset = 8;
+  }
+  switch (partition) {
+    BLOCK_SIZE mfqe_bs, bs_tmp;
+    case PARTITION_HORZ:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_64X32;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_32X16;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;
+    case PARTITION_VERT:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_32X64;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_16X32;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;
+    case PARTITION_NONE:
+      if (mfqe_decision(mi, cur_bs)) {
+        // Do mfqe on this partition.
+        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+      } else {
+        // Copy the block from the current frame (i.e., no mfqe is done).
+        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                   yd_stride, uvd_stride, bs);
+      }
+      break;
+    case PARTITION_SPLIT:
+      // Recurse into the four square partitions, e.g. if bs is 64X64,
+      // look into the four 32X32 blocks inside it.
+      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+                     yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
+                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+                     y + y_offset * y_stride, u + uv_offset * uv_stride,
+                     v + uv_offset * uv_stride, y_stride, uv_stride,
+                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+                     subsize, y + y_offset * y_stride + y_offset,
+                     u + uv_offset * uv_stride + uv_offset,
+                     v + uv_offset * uv_stride + uv_offset, y_stride,
+                     uv_stride, yd + y_offset * yd_stride + y_offset,
+                     ud + uv_offset * uvd_stride + uv_offset,
+                     vd + uv_offset * uvd_stride + uv_offset,
+                     yd_stride, uvd_stride);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+  int mi_row, mi_col;
+  // Current decoded frame.
+  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // The last decoded frame, which will also store the MFQE result.
+  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+  // Loop through each super block.
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      MODE_INFO *mi;
+      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion info from the last frame.
+      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+                           (mi_row * cm->mi_stride + mi_col);
+      const uint32_t y_stride = show->y_stride;
+      const uint32_t uv_stride = show->uv_stride;
+      const uint32_t yd_stride = dest->y_stride;
+      const uint32_t uvd_stride = dest->uv_stride;
+      const uint32_t row_offset_y = mi_row << 3;
+      const uint32_t row_offset_uv = mi_row << 2;
+      const uint32_t col_offset_y = mi_col << 3;
+      const uint32_t col_offset_uv = mi_col << 2;
+      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+                         col_offset_y;
+      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      if (frame_is_intra_only(cm)) {
+        mi = mi_prev;
+      } else {
+        mi = mi_local;
+      }
+      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+                     vd, yd_stride, uvd_stride);
+    }
+  }
+}
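Note on the blend above: a minimal standalone sketch (not part of the patch) of the per-pixel weighting that filter_by_weight() applies, assuming MFQE_PRECISION is 4 so that the weights are expressed in sixteenths; the helper name is hypothetical.

/* Sketch of filter_by_weight()'s per-pixel blend, MFQE_PRECISION assumed == 4. */
#include <stdint.h>

static uint8_t mfqe_blend_px(uint8_t src, uint8_t dst, int src_weight) {
  const int precision = 4;                          /* MFQE_PRECISION (assumed) */
  const int dst_weight = (1 << precision) - src_weight;
  const int rounding = 1 << (precision - 1);
  return (uint8_t)((src * src_weight + dst * dst_weight + rounding) >> precision);
}

/* Example: mfqe_blend_px(200, 120, 12) == (2400 + 480 + 8) >> 4 == 180,
 * i.e. a rounded 12/16 vs 4/16 mix of the two blocks. */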
diff --git a/libvpx/vp9/common/vp9_mfqe.h b/libvpx/vp9/common/vp9_mfqe.h
new file mode 100644
index 0000000..dfff8c2
--- /dev/null
+++ b/libvpx/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// Replacement is applied only to stationary blocks, selected by checking
+// the motion of the block and other conditions such as the SAD between
+// the current block and the correlated block and the variance of the
+// block difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MFQE_H_
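For context on mfqe_block() in vp9_mfqe.c above, a hedged sketch of how the blending factor is derived from the normalized SAD and variance, again assuming MFQE_PRECISION is 4 (so weight is 16); the helper name and the sample numbers are illustrative only.

/* ifactor grows with sad * vdiff relative to the qdiff-adjusted thresholds
 * from get_thr(); once it reaches weight (16) the current block is kept
 * unchanged, i.e. no MFQE is applied. */
static int example_mfqe_ifactor(int sad, int vdiff, int sad_thr, int vdiff_thr) {
  const int weight = 1 << 4;  /* MFQE_PRECISION assumed to be 4 */
  int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
  if (ifactor > weight) ifactor = weight;
  return ifactor;
}

/* e.g. example_mfqe_ifactor(4, 30, 7, 141) == 1920 / 987 == 1 (strong reuse
 * of the previous, higher-quality block), while (20, 200, 7, 141) clamps
 * to 16, leaving the current block untouched. */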
diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h
index 3eb7f9d..5d89da8 100644
--- a/libvpx/vp9/common/vp9_mv.h
+++ b/libvpx/vp9/common/vp9_mv.h
@@ -34,6 +34,14 @@
   int32_t col;
 } MV32;
 
+static INLINE int is_zero_mv(const MV *mv) {
+  return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return  *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
 static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
                             int min_row, int max_row) {
   mv->col = clamp(mv->col, min_col, max_col);
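The two new helpers above compare an MV as a single 32-bit word, mirroring int_mv's as_int view. A field-wise equivalent is shown below purely for illustration, using hypothetical standalone types rather than the codec's own.

#include <stdint.h>

typedef struct { int16_t row, col; } ExampleMV;  /* mirrors MV's layout */

static int example_is_zero_mv(const ExampleMV *mv) {
  return mv->row == 0 && mv->col == 0;    /* same result as the uint32_t test */
}

static int example_is_equal_mv(const ExampleMV *a, const ExampleMV *b) {
  return a->row == b->row && a->col == b->col;
}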
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index ab64d30..77d1ff4 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -14,25 +14,22 @@
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                             const TileInfo *const tile,
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                              int_mv *mv_ref_list,
-                             int block, int mi_row, int mi_col) {
+                             int block, int mi_row, int mi_col,
+                             find_mv_refs_sync sync, void *const data,
+                             uint8_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
-        ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
-        : NULL;
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
-
-
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
-
   int different_ref_found = 0;
   int context_counter = 0;
+  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
 
   // Blank the reference vector list
-  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
 
   // The nearest 2 blocks are treated differently
   // if the size < 8x8 we get the mv from the bmi substructure,
@@ -48,9 +45,11 @@
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block));
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                        refmv_count, mv_ref_list, Done);
       else if (candidate->ref_frame[1] == ref_frame)
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block));
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                        refmv_count, mv_ref_list, Done);
     }
   }
 
@@ -65,18 +64,34 @@
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[0]);
+        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
       else if (candidate->ref_frame[1] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[1]);
+        ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, Done);
     }
   }
 
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on the Windows platform. The sync here is unnecessary if
+  // use_prev_frame_mvs is 0, but after removing it the unit test hangs on
+  // Windows due to several threads waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+#endif
+
   // Check the last frame's mode and mv info.
-  if (prev_mbmi) {
-    if (prev_mbmi->ref_frame[0] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[0]);
-    else if (prev_mbmi->ref_frame[1] == ref_frame)
-      ADD_MV_REF_LIST(prev_mbmi->mv[1]);
+  if (cm->use_prev_frame_mvs) {
+    // Synchronize here for frame parallel decode if sync function is provided.
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
   }
 
   // Since we couldn't find 2 mvs from the same reference frame
@@ -90,18 +105,41 @@
                                               * xd->mi_stride]->mbmi;
 
         // If the candidate is INTRA we don't want to consider its mv.
-        IF_DIFF_REF_FRAME_ADD_MV(candidate);
+        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+                                 refmv_count, mv_ref_list, Done);
       }
     }
   }
 
   // Since we still don't have a candidate we'll try the last frame.
-  if (prev_mbmi)
-    IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi);
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
 
  Done:
 
-  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+  mode_context[ref_frame] = counter_to_context[context_counter];
 
   // Clamp vectors
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
@@ -109,12 +147,13 @@
 }
 
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
-  find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
-                   mi_row, mi_col);
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context) {
+  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
+                   mi_row, mi_col, sync, data, mode_context);
 }
 
 static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -127,23 +166,23 @@
   }
 }
 
-
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
-                           int_mv *mvlist, int_mv *nearest, int_mv *near) {
+                           int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv) {
   int i;
   // Make sure all the candidates are properly clamped etc
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
     lower_mv_precision(&mvlist[i].as_mv, allow_hp);
     clamp_mv2(&mvlist[i].as_mv, xd);
   }
-  *nearest = mvlist[0];
-  *near = mvlist[1];
+  *nearest_mv = mvlist[0];
+  *near_mv = mvlist[1];
 }
 
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest, int_mv *near) {
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context) {
   int_mv mv_list[MAX_MV_REF_CANDIDATES];
   MODE_INFO *const mi = xd->mi[0];
   b_mode_info *bmi = mi->bmi;
@@ -151,21 +190,21 @@
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col);
+  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
+                   mi_row, mi_col, NULL, NULL, mode_context);
 
-  near->as_int = 0;
+  near_mv->as_int = 0;
   switch (block) {
     case 0:
-      nearest->as_int = mv_list[0].as_int;
-      near->as_int = mv_list[1].as_int;
+      nearest_mv->as_int = mv_list[0].as_int;
+      near_mv->as_int = mv_list[1].as_int;
       break;
     case 1:
     case 2:
-      nearest->as_int = bmi[0].as_mv[ref].as_int;
+      nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
       for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
-        if (nearest->as_int != mv_list[n].as_int) {
-          near->as_int = mv_list[n].as_int;
+        if (nearest_mv->as_int != mv_list[n].as_int) {
+          near_mv->as_int = mv_list[n].as_int;
           break;
         }
       break;
@@ -176,15 +215,15 @@
       candidates[2] = mv_list[0];
       candidates[3] = mv_list[1];
 
-      nearest->as_int = bmi[2].as_mv[ref].as_int;
+      nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
       for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
-        if (nearest->as_int != candidates[n].as_int) {
-          near->as_int = candidates[n].as_int;
+        if (nearest_mv->as_int != candidates[n].as_int) {
+          near_mv->as_int = candidates[n].as_int;
           break;
         }
       break;
     }
     default:
-      assert("Invalid block index.");
+      assert(0 && "Invalid block index.");
   }
 }
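The previous-frame fallback added above negates a candidate motion vector when the sign biases of the two reference frames differ (roughly, when the references sit on opposite temporal sides of the current frame). A small illustration with a hypothetical helper, not part of the patch:

#include <stdint.h>

typedef struct { int16_t row, col; } ExampleMV;

/* If the candidate MV points at a reference with the opposite sign bias,
 * flip it so it predicts in the same temporal direction as the target ref. */
static ExampleMV flip_for_sign_bias(ExampleMV mv, int candidate_bias,
                                    int target_bias) {
  if (candidate_bias != target_bias) {
    mv.row = (int16_t)-mv.row;
    mv.col = (int16_t)-mv.col;
  }
  return mv;
}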
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
index a937b78..bd216d4 100644
--- a/libvpx/vp9/common/vp9_mvref_common.h
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -158,29 +158,32 @@
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector it will also
 // skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv) \
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
   do { \
     if (refmv_count) { \
-      if ((mv).as_int != mv_ref_list[0].as_int) { \
-        mv_ref_list[refmv_count] = (mv); \
+      if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+        (mv_ref_list)[(refmv_count)] = (mv); \
         goto Done; \
       } \
     } else { \
-      mv_ref_list[refmv_count++] = (mv); \
+      (mv_ref_list)[(refmv_count)++] = (mv); \
     } \
   } while (0)
 
 // If either reference frame is different, not INTRA, and they
 // are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+                                 mv_ref_list, Done) \
   do { \
     if (is_inter_block(mbmi)) { \
       if ((mbmi)->ref_frame[0] != ref_frame) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, Done); \
       if (has_second_ref(mbmi) && \
           (mbmi)->ref_frame[1] != ref_frame && \
           (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, Done); \
     } \
   } while (0)
 
@@ -204,21 +207,23 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int mi_row, int mi_col);
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
 // score to use as ref motion vector
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
-                           int_mv *mvlist, int_mv *nearest, int_mv *near);
+                           int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
 
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest, int_mv *near);
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index dff077c..c373c02 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,9 @@
 
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_util/vpx_thread.h"
 #include "./vp9_rtcd.h"
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
@@ -35,14 +37,19 @@
 #define REF_FRAMES_LOG2 3
 #define REF_FRAMES (1 << REF_FRAMES_LOG2)
 
-// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add on-demand frame buffers instead of hardcoding the
+// number of frame buffers.
 // TODO(jkoleszar): These 3 extra references could probably come from the
 // normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 7)
 
 #define FRAME_CONTEXTS_LOG2 2
 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
 
+#define NUM_PING_PONG_BUFFERS 2
+
 extern const struct {
   PARTITION_CONTEXT above;
   PARTITION_CONTEXT left;
@@ -56,21 +63,55 @@
   REFERENCE_MODES       = 3,
 } REFERENCE_MODE;
 
+typedef struct {
+  int_mv mv[2];
+  MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
 
 typedef struct {
   int ref_count;
+  MV_REF *mvs;
+  int mi_rows;
+  int mi_cols;
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
+
+  // The following variables are only used in frame parallel decode.
+
+  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  VPxWorker *frame_worker_owner;
+
+  // row and col indicate the position, in real pixel units, up to which the
+  // frame has been decoded. They are reset to -1 when decoding begins and
+  // set to INT_MAX when the frame is fully decoded.
+  int row;
+  int col;
 } RefCntBuffer;
 
+typedef struct BufferPool {
+  // Protect BufferPool from being accessed by several FrameWorkers at
+  // the same time during frame parallel decode.
+  // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
-
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
-
-  COLOR_SPACE color_space;
-
+  vpx_color_space_t color_space;
   int width;
   int height;
   int display_width;
@@ -84,12 +125,22 @@
   int subsampling_x;
   int subsampling_y;
 
-  YV12_BUFFER_CONFIG *frame_to_show;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;  // Marks if we need to use 16bit frame buffers.
+#endif
 
-  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+  YV12_BUFFER_CONFIG *frame_to_show;
+  RefCntBuffer *prev_frame;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  RefCntBuffer *cur_frame;
 
   int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
 
+  // Prepare ref_frame_map for the next frame.
+  // Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
   // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
   // roll new_fb_idx into it.
 
@@ -98,7 +149,10 @@
 
   int new_fb_idx;
 
+#if CONFIG_VP9_POSTPROC
   YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
   FRAME_TYPE frame_type;
@@ -108,7 +162,8 @@
   int show_existing_frame;
 
   // Flag signaling that the frame is encoded using only INTRA modes.
-  int intra_only;
+  uint8_t intra_only;
+  uint8_t last_intra_only;
 
   int allow_high_precision_mv;
 
@@ -131,27 +186,43 @@
   int y_dc_delta_q;
   int uv_dc_delta_q;
   int uv_ac_delta_q;
+  int16_t y_dequant[MAX_SEGMENTS][2];
+  int16_t uv_dequant[MAX_SEGMENTS][2];
 
   /* We allocate a MODE_INFO struct for each macroblock, together with
      an extra row on top and column on the left to simplify prediction. */
-
-  int mi_idx;
-  int prev_mi_idx;
-  MODE_INFO *mip_array[2];
-  MODE_INFO **mi_grid_base_array[2];
-
+  int mi_alloc_size;
   MODE_INFO *mip; /* Base of allocated array */
   MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+
+  // TODO(agrange): Move prev_mi into encoder structure.
+  // prev_mip and prev_mi will only be allocated in VP9 encoder.
   MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
   MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
 
+  // Separate mi functions between encoder and decoder.
+  int (*alloc_mi)(struct VP9Common *cm, int mi_size);
+  void (*free_mi)(struct VP9Common *cm);
+  void (*setup_mi)(struct VP9Common *cm);
+
+  // Grid of pointers to 8x8 MODE_INFO structs.  Any 8x8 not in the visible
+  // area will be NULL.
   MODE_INFO **mi_grid_base;
   MODE_INFO **mi_grid_visible;
   MODE_INFO **prev_mi_grid_base;
   MODE_INFO **prev_mi_grid_visible;
 
+  // Whether to use previous frame's motion vectors for prediction.
+  int use_prev_frame_mvs;
+
   // Persistent mb segment id map used in prediction.
-  unsigned char *last_frame_seg_map;
+  int seg_map_idx;
+  int prev_seg_map_idx;
+
+  uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+  uint8_t *last_frame_seg_map;
+  uint8_t *current_frame_seg_map;
+  int seg_map_alloc_size;
 
   INTERP_FILTER interp_filter;
 
@@ -164,22 +235,26 @@
   struct loopfilter lf;
   struct segmentation seg;
 
+  // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+  // in pbi.
+  int frame_parallel_decode;  // frame-based threading.
+
   // Context probabilities for reference frame prediction
-  int allow_comp_inter_inter;
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[2];
   REFERENCE_MODE reference_mode;
 
-  FRAME_CONTEXT fc;  /* this frame entropy */
-  FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS];
+  FRAME_CONTEXT *fc;  /* this frame entropy */
+  FRAME_CONTEXT *frame_contexts;   // FRAME_CONTEXTS
   unsigned int  frame_context_idx; /* Context to use/update */
   FRAME_COUNTS counts;
 
   unsigned int current_video_frame;
   BITSTREAM_PROFILE profile;
 
-  // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2
-  BIT_DEPTH bit_depth;
+  // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+  vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
 
 #if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
@@ -189,6 +264,8 @@
   int frame_parallel_decoding_mode;
 
   int log2_tile_cols, log2_tile_rows;
+  int byte_alignment;
+  int skip_loop_filter;
 
   // Private data associated with the frame buffer callbacks.
   void *cb_priv;
@@ -198,31 +275,49 @@
   // Handles memory for the codec.
   InternalFrameBufferList int_frame_buffers;
 
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;
+
   PARTITION_CONTEXT *above_seg_context;
   ENTROPY_CONTEXT *above_context;
+  int above_context_alloc_cols;
 } VP9_COMMON;
 
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
   if (index < 0 || index >= REF_FRAMES)
     return NULL;
   if (cm->ref_frame_map[index] < 0)
     return NULL;
-  assert(cm->ref_frame_map[index] < REF_FRAMES);
-  return &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+  assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
+  return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
 }
 
 static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
-  return &cm->frame_bufs[cm->new_fb_idx].buf;
+  return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }
 
 static INLINE int get_free_fb(VP9_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
   int i;
-  for (i = 0; i < FRAME_BUFFERS; i++)
-    if (cm->frame_bufs[i].ref_count == 0)
+
+  lock_buffer_pool(cm->buffer_pool);
+  for (i = 0; i < FRAME_BUFFERS; ++i)
+    if (frame_bufs[i].ref_count == 0)
       break;
 
-  assert(i < FRAME_BUFFERS);
-  cm->frame_bufs[i].ref_count = 1;
+  if (i != FRAME_BUFFERS) {
+    frame_bufs[i].ref_count = 1;
+  } else {
+    // Reset i to be INVALID_IDX to indicate no free buffer found.
+    i = INVALID_IDX;
+  }
+
+  unlock_buffer_pool(cm->buffer_pool);
   return i;
 }
 
@@ -241,27 +336,46 @@
   return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
 }
 
-static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->plane[i].dqcoeff = xd->dqcoeff[i];
-    xd->above_context[i] = cm->above_context +
-        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-  }
-
-  xd->above_seg_context = cm->above_seg_context;
-  xd->mi_stride = cm->mi_stride;
-}
-
 static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
   return cm->frame_type == KEY_FRAME || cm->intra_only;
 }
 
-static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
+static INLINE void set_partition_probs(const VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd) {
+  xd->partition_probs =
+      frame_is_intra_only(cm) ?
+          &vp9_kf_partition_probs[0] :
+          (const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+}
+
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    xd->plane[i].dqcoeff = dqcoeff;
+    xd->above_context[i] = cm->above_context +
+        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
+
+    if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
+      memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
+    } else {
+      memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
+    }
+    xd->fc = cm->fc;
+    xd->frame_parallel_decoding_mode = cm->frame_parallel_decoding_mode;
+  }
+
+  xd->above_seg_context = cm->above_seg_context;
+  xd->mi_stride = cm->mi_stride;
+  xd->error_info = &cm->error;
+
+  set_partition_probs(cm, xd);
+}
+
+static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd,
                                                   int ctx) {
-  return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
-                                 : cm->fc.partition_prob[ctx];
+  return xd->partition_probs[ctx];
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -275,6 +389,11 @@
   }
 }
 
+static INLINE int calc_mi_size(int len) {
+  // len is in mi units.
+  return len + MI_BLOCK_SIZE;
+}
+
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh,
                                   int mi_col, int bw,
@@ -287,17 +406,23 @@
   // Are edges available for intra prediction?
   xd->up_available    = (mi_row != 0);
   xd->left_available  = (mi_col > tile->mi_col_start);
-}
+  if (xd->up_available) {
+    xd->above_mi = xd->mi[-xd->mi_stride];
+    // above_mi may be NULL in VP9 encoder's first pass.
+    xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+  } else {
+    xd->above_mi = NULL;
+    xd->above_mbmi = NULL;
+  }
 
-static INLINE void set_prev_mi(VP9_COMMON *cm) {
-  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
-                                       cm->height == cm->last_height &&
-                                       !cm->intra_only &&
-                                       cm->last_show_frame;
-  // Special case: set prev_mi to NULL when the previous mode info
-  // context cannot be used.
-  cm->prev_mi = use_prev_in_find_mv_refs ?
-                  cm->prev_mip + cm->mi_stride + 1 : NULL;
+  if (xd->left_available) {
+    xd->left_mi = xd->mi[-1];
+    // left_mi may be NULL in VP9 encoder's first pass.
+    xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+  } else {
+    xd->left_mi = NULL;
+    xd->left_mbmi = NULL;
+  }
 }
 
 static INLINE void update_partition_context(MACROBLOCKD *xd,
@@ -313,8 +438,8 @@
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
-  vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
-  vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
+  memset(above_ctx, partition_context_lookup[subsize].above, bs);
+  memset(left_ctx, partition_context_lookup[subsize].left, bs);
 }
 
 static INLINE int partition_plane_context(const MACROBLOCKD *xd,
@@ -322,21 +447,12 @@
                                           BLOCK_SIZE bsize) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
   const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const int bsl = mi_width_log2_lookup[bsize];
+  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 
-  const int bsl = mi_width_log2(bsize);
-  const int bs = 1 << bsl;
-  int above = 0, left = 0, i;
-
-  assert(b_width_log2(bsize) == b_height_log2(bsize));
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
   assert(bsl >= 0);
 
-  for (i = 0; i < bs; i++) {
-    above |= above_ctx[i];
-    left |= left_ctx[i];
-  }
-  above = (above & bs) > 0;
-  left  = (left & bs) > 0;
-
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index abda4e6..71ab861 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -16,20 +16,21 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
 
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_textblit.h"
 
 #if CONFIG_VP9_POSTPROC
-static const short kernel5[] = {
+static const int16_t kernel5[] = {
   1, 1, 4, 1, 1
 };
 
-const short vp9_rv[] = {
+const int16_t vp9_rv[] = {
   8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
   0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
   10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
@@ -76,6 +77,9 @@
   0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
 void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
@@ -85,10 +89,7 @@
                                      int flimit) {
   uint8_t const *p_src;
   uint8_t *p_dst;
-  int row;
-  int col;
-  int i;
-  int v;
+  int row, col, i, v, kernel;
   int pitch = src_pixels_per_line;
   uint8_t d[8];
   (void)dst_pixels_per_line;
@@ -99,8 +100,8 @@
     p_dst = dst_ptr;
 
     for (col = 0; col < cols; col++) {
-      int kernel = 4;
-      int v = p_src[col];
+      kernel = 4;
+      v = p_src[col];
 
       for (i = -2; i <= 2; i++) {
         if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -122,7 +123,7 @@
       d[i] = p_src[i];
 
     for (col = 0; col < cols; col++) {
-      int kernel = 4;
+      kernel = 4;
       v = p_src[col];
 
       d[col & 7] = v;
@@ -152,6 +153,81 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+                                            uint16_t *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line,
+                                            int rows,
+                                            int cols,
+                                            int flimit) {
+  uint16_t const *p_src;
+  uint16_t *p_dst;
+  int row, col, i, v, kernel;
+  int pitch = src_pixels_per_line;
+  uint16_t d[8];
+
+  for (row = 0; row < rows; row++) {
+    // post_proc_down for one row.
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+
+    across_skip_convolve:
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+
+    /* next row */
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static int q2mbl(int x) {
   if (x < 20) x = 20;
 
@@ -162,10 +238,46 @@
 void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                  int rows, int cols, int flimit) {
   int r, c, i;
-
   uint8_t *s = src;
   uint8_t d[16];
 
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+                                        int rows, int cols, int flimit) {
+  int r, c, i;
+
+  uint16_t *s = src;
+  uint16_t d[16];
+
 
   for (r = 0; r < rows; r++) {
     int sumsq = 0;
@@ -196,6 +308,7 @@
     s += pitch;
   }
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
@@ -207,7 +320,7 @@
     int sumsq = 0;
     int sum   = 0;
     uint8_t d[16];
-    const short *rv2 = rv3 + ((c * 17) & 127);
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
 
     for (i = -8; i <= 6; i++) {
       sumsq += s[i * pitch] * s[i * pitch];
@@ -229,6 +342,40 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+                                   int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
+
+  for (c = 0; c < cols; c++) {
+    uint16_t *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    uint16_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
                                        YV12_BUFFER_CONFIG   *post,
                                        int                   q,
@@ -239,6 +386,51 @@
   (void) low_var_thresh;
   (void) flag;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+                                         CONVERT_TO_SHORTPTR(post->y_buffer),
+                                         source->y_stride, post->y_stride,
+                                         source->y_height, source->y_width,
+                                         ppl);
+
+    vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                     post->y_stride, post->y_height,
+                                     post->y_width, q2mbl(q));
+
+    vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                post->y_stride, post->y_height,
+                                post->y_width, q2mbl(q));
+
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+                                         CONVERT_TO_SHORTPTR(post->u_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+                                         CONVERT_TO_SHORTPTR(post->v_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+  } else {
+    vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                  source->y_stride, post->y_stride,
+                                  source->y_height, source->y_width, ppl);
+
+    vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                              post->y_width, q2mbl(q));
+
+    vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                         post->y_width, q2mbl(q));
+
+    vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+    vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+  }
+#else
   vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                 source->y_stride, post->y_stride,
                                 source->y_height, source->y_width, ppl);
@@ -255,6 +447,7 @@
   vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                 source->uv_stride, post->uv_stride,
                                 source->uv_height, source->uv_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -271,10 +464,26 @@
   uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
+                                           CONVERT_TO_SHORTPTR(dsts[i]),
+                                           src_strides[i], dst_strides[i],
+                                           src_heights[i], src_widths[i], ppl);
+    } else {
+      vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                    src_strides[i], dst_strides[i],
+                                    src_heights[i], src_widths[i], ppl);
+    }
+#else
     vp9_post_proc_down_and_across(srcs[i], dsts[i],
                                   src_strides[i], dst_strides[i],
                                   src_heights[i], src_widths[i], ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 }
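The deblock, de-macroblock and denoise paths above now branch on YV12_FLAG_HIGHBITDEPTH and hand the plane to a vp9_highbd_* kernel after converting the byte pointer with CONVERT_TO_SHORTPTR; the geometry (strides, widths, heights) is unchanged, only the sample width differs. A hedged sketch of that dispatch pattern, with stand-in names and with CONVERT_TO_SHORTPTR approximated by a plain cast:

  /* FLAG_HIGHBITDEPTH and filter_plane_* are illustrative stand-ins. */
  #include <stdint.h>

  #define FLAG_HIGHBITDEPTH 8

  static void filter_plane_8(uint8_t *buf, int stride, int rows, int cols) {
    int r, c;
    for (r = 0; r < rows; r++)
      for (c = 0; c + 1 < cols; c++)   /* trivial stand-in filter */
        buf[r * stride + c] = (uint8_t)((buf[r * stride + c] +
                                         buf[r * stride + c + 1]) >> 1);
  }

  static void filter_plane_16(uint16_t *buf, int stride, int rows, int cols) {
    int r, c;
    for (r = 0; r < rows; r++)
      for (c = 0; c + 1 < cols; c++)   /* same geometry, wider samples */
        buf[r * stride + c] = (uint16_t)((buf[r * stride + c] +
                                          buf[r * stride + c + 1]) >> 1);
  }

  static void filter_plane(uint8_t *buf, int stride, int rows, int cols,
                           unsigned int flags) {
    if (flags & FLAG_HIGHBITDEPTH)
      filter_plane_16((uint16_t *)(void *)buf, stride, rows, cols);
    else
      filter_plane_8(buf, stride, rows, cols);
  }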
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
@@ -293,15 +502,34 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     const int src_stride = src_strides[i];
-    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
     const int src_width = src_widths[i] - 4;
     const int src_height = src_heights[i] - 4;
-
     const int dst_stride = dst_strides[i];
-    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
 
-    vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
+          srcs[i] + 2 * src_stride + 2);
+      uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
+          dsts[i] + 2 * dst_stride + 2);
+      vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+                                           dst_stride, src_height, src_width,
+                                           ppl);
+    } else {
+      const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+      uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+
+      vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+                                    dst_stride, src_height, src_width, ppl);
+    }
+#else
+    const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+    uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+    vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
                                   src_height, src_width, ppl);
+#endif
   }
 }
 
@@ -316,7 +544,7 @@
   double sigma;
   int ai = a, qi = q, i;
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   sigma = ai + .5 + .6 * (63 - qi) / 63.0;
 
@@ -324,16 +552,15 @@
    * a gaussian distribution with sigma determined by q.
    */
   {
-    double i;
     int next, j;
 
     next = 0;
 
     for (i = -32; i < 32; i++) {
-      int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+      int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
 
-      if (a) {
-        for (j = 0; j < a; j++) {
+      if (a_i) {
+        for (j = 0; j < a_i; j++) {
           char_dist[next + j] = (char) i;
         }
 
@@ -366,6 +593,9 @@
                            unsigned int width, unsigned int height, int pitch) {
   unsigned int i, j;
 
+  // TODO(jbb): why does the SIMD code use both clamps but the C code
+  // doesn't? Normalize and fix.
+  (void) bothclamp;
   for (i = 0; i < height; i++) {
     uint8_t *pos = start + i * pitch;
     char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
@@ -382,9 +612,20 @@
   }
 }
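The noise path works by building a distribution table once per (q, noise_level) pair: each offset i in [-32, 32) receives a number of slots proportional to a Gaussian density with sigma derived from q, so picking table entries at (pseudo-)random yields approximately Gaussian noise samples that vp9_plane_add_noise then adds to each row. A minimal standalone sketch of the table construction; gaussian() and the sigma value below are local stand-ins for the helpers earlier in this file:

  #include <math.h>
  #include <stdio.h>

  #ifndef M_PI
  #define M_PI 3.14159265358979323846
  #endif

  static double gaussian(double sigma, double mu, double x) {
    return 1.0 / (sigma * sqrt(2.0 * M_PI)) *
           exp(-(x - mu) * (x - mu) / (2.0 * sigma * sigma));
  }

  int main(void) {
    char char_dist[256];
    const double sigma = 2.5;   /* representative value only */
    int next = 0, i, j = 0;

    for (i = -32; i < 32; i++) {
      const int count = (int)(0.5 + 256 * gaussian(sigma, 0, i));
      for (j = 0; j < count && next + j < 256; j++)
        char_dist[next + j] = (char)i;   /* more slots near the mean */
      next += j;
    }
    printf("filled %d of 256 noise slots\n", next);
    return 0;
  }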
 
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+  // Current mip will be the prev_mip for the next frame.
+  MODE_INFO *temp = cm->postproc_state.prev_mip;
+  cm->postproc_state.prev_mip = cm->mip;
+  cm->mip = temp;
+
+  // Update the upper left visible macroblock ptrs.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
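swap_mi_and_prev_mi() above is plain double buffering: the post-processor keeps last frame's mode-info array around for MFQE by exchanging base pointers instead of copying the array every frame, then rebases the "upper left visible" pointers off the new bases. A hedged sketch of the idea, with stand-in names:

  #include <stddef.h>

  struct mode_info;          /* opaque here; MODE_INFO in libvpx */

  struct mi_buffers {
    struct mode_info *cur;   /* written while decoding the current frame */
    struct mode_info *prev;  /* read-only snapshot of the previous frame */
  };

  static void swap_mi(struct mi_buffers *b) {
    struct mode_info *tmp = b->prev;
    b->prev = b->cur;        /* current frame becomes next frame's "previous" */
    b->cur = tmp;            /* old prev storage is recycled for the next frame */
  }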
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
-  const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+  const int q = MIN(105, cm->lf.filter_level * 2);
   const int flags = ppflags->post_proc_flag;
   YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
   struct postproc_state *const ppstate = &cm->postproc_state;
@@ -397,17 +638,76 @@
     return 0;
   }
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
-  if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
+  // Alloc memory for prev_mip in the first frame.
+  if (cm->current_video_frame == 1) {
+    cm->postproc_state.last_base_qindex = cm->base_qindex;
+    cm->postproc_state.last_frame_valid = 1;
+    ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+    if (!ppstate->prev_mip) {
+      return 1;
+    }
+    ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+    memset(ppstate->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  }
+
+  // Allocate post_proc_buffer_int if needed.
+  if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+      const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+      if (vpx_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+                                 cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cm->byte_alignment) < 0) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate MFQE framebuffer");
+      }
+
+      // Fill the MFQE buffer with a fixed mid-gray value (128) so that
+      // post proc doesn't pull random data in from the edge.
+      memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+             cm->post_proc_buffer.frame_size);
+    }
+  }
+
+  if (vpx_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
-#endif
 
-  if (flags & VP9D_DEMACROBLOCK) {
+  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
+      cm->postproc_state.last_base_qindex <= last_q_thresh &&
+      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+    vp9_mfqe(cm);
+    // TODO(jackychen): Consider whether to enable deblocking by default
+    // if mfqe is enabled. Need to take both the quality and the speed
+    // into consideration.
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+    }
+    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+                                 q + (ppflags->deblocking_level - 5) * 10,
+                                 1, 0);
+    } else if (flags & VP9D_DEBLOCK) {
+      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+    } else {
+      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+    }
+  } else if (flags & VP9D_DEMACROBLOCK) {
     deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
                                q + (ppflags->deblocking_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
@@ -416,6 +716,9 @@
     vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
+  cm->postproc_state.last_base_qindex = cm->base_qindex;
+  cm->postproc_state.last_frame_valid = 1;
+
   if (flags & VP9D_ADDNOISE) {
     const int noise_level = ppflags->noise_level;
     if (ppstate->last_q != q ||
@@ -436,6 +739,7 @@
   dest->uv_width = dest->y_width >> cm->subsampling_x;
   dest->uv_height = dest->y_height >> cm->subsampling_y;
 
+  swap_mi_and_prev_mi(cm);
   return 0;
 }
-#endif
+#endif  // CONFIG_VP9_POSTPROC
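The MFQE path added to vp9_post_proc_frame() only runs when all of its gating conditions hold: the VP9D_MFQE flag is set, at least one previous frame has been decoded and recorded as valid, the stream is 8-bit, the previous frame's base qindex was below a threshold, and the current frame's qindex jumped by at least a minimum amount (i.e. quality dropped sharply, which is when borrowing detail from the previous frame helps). A hedged restatement of that gate; last_q_thresh and q_diff_thresh are defined elsewhere in the file and are treated as parameters here:

  int should_run_mfqe(int flags, unsigned int frame_no, int last_frame_valid,
                      int bit_depth, int last_qindex, int cur_qindex,
                      int last_q_thresh, int q_diff_thresh) {
    return (flags & (1 << 10)) &&            /* VP9D_MFQE, per vp9_ppflags.h */
           frame_no >= 2 &&                  /* need a previous decoded frame */
           last_frame_valid &&
           bit_depth == 8 &&                 /* 8-bit pipeline only */
           last_qindex <= last_q_thresh &&   /* previous frame was good enough */
           cur_qindex - last_qindex >= q_diff_thresh;  /* quality dropped a lot */
  }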
diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h
index ebebc1a..035c9cd 100644
--- a/libvpx/vp9/common/vp9_postproc.h
+++ b/libvpx/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
 
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
 #include "vp9/common/vp9_ppflags.h"
 
 #ifdef __cplusplus
@@ -24,6 +26,10 @@
   int last_q;
   int last_noise;
   char noise[3072];
+  int last_base_qindex;
+  int last_frame_valid;
+  MODE_INFO *prev_mip;
+  MODE_INFO *prev_mi;
   DECLARE_ALIGNED(16, char, blackclamp[16]);
   DECLARE_ALIGNED(16, char, whiteclamp[16]);
   DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@
 
 struct VP9Common;
 
+#define MFQE_PRECISION 4
+
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
 
diff --git a/libvpx/vp9/common/vp9_ppflags.h b/libvpx/vp9/common/vp9_ppflags.h
index 1644a1b..12b989f 100644
--- a/libvpx/vp9/common/vp9_ppflags.h
+++ b/libvpx/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@
   VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
   VP9D_DEBUG_DRAW_MV          = 1 << 7,
   VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
-  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+  VP9D_MFQE                   = 1 << 10
 };
 
 typedef struct {
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 0146384..1f16325 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -9,27 +9,21 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <limits.h>
-
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
-static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) {
-  return (mi != NULL) ? &mi->mbmi : NULL;
-}
-
 // Returns a context number for the given MB prediction signal
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
   // Note:
   // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ?
-                           left_mbmi->interp_filter : SWITCHABLE_FILTERS;
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ?
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
+                            left_mbmi->interp_filter : SWITCHABLE_FILTERS;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
                              above_mbmi->interp_filter : SWITCHABLE_FILTERS;
 
   if (left_type == above_type)
@@ -50,10 +44,10 @@
 // 2 - intra/--, --/intra
 // 3 - intra/intra
 int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int has_above = above_mbmi != NULL;
-  const int has_left = left_mbmi != NULL;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
 
   if (has_above && has_left) {  // both edges available
     const int above_intra = !is_inter_block(above_mbmi);
@@ -70,10 +64,10 @@
 int vp9_get_reference_mode_context(const VP9_COMMON *cm,
                                    const MACROBLOCKD *xd) {
   int ctx;
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int has_above = above_mbmi != NULL;
-  const int has_left = left_mbmi != NULL;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
   // Note:
   // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
@@ -113,10 +107,10 @@
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                     const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int above_in_image = above_mbmi != NULL;
-  const int left_in_image = left_mbmi != NULL;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
 
   // Note:
   // The mode info data structure has a one element border above and to the
@@ -194,10 +188,10 @@
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int has_above = above_mbmi != NULL;
-  const int has_left = left_mbmi != NULL;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
   // Note:
   // The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
@@ -260,10 +254,10 @@
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int has_above = above_mbmi != NULL;
-  const int has_left = left_mbmi != NULL;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
 
   // Note:
   // The mode info data structure has a one element border above and to the
@@ -343,43 +337,3 @@
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
 }
-// Returns a context number for the given MB prediction signal
-// The mode info data structure has a one element border above and to the
-// left of the entries corresponding to real blocks.
-// The prediction flags in these dummy entries are initialized to 0.
-int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
-  const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
-  const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
-  const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
-  const int has_above = above_mbmi != NULL;
-  const int has_left = left_mbmi != NULL;
-  int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
-                                                   : max_tx_size;
-  int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
-                                                : max_tx_size;
-  if (!has_left)
-    left_ctx = above_ctx;
-
-  if (!has_above)
-    above_ctx = left_ctx;
-
-  return (above_ctx + left_ctx) > max_tx_size;
-}
-
-int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int x, y, segment_id = INT_MAX;
-
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      segment_id = MIN(segment_id,
-                       segment_ids[mi_offset + y * cm->mi_cols + x]);
-
-  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
-  return segment_id;
-}
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 2c96506..67b95db 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -18,20 +18,28 @@
 extern "C" {
 #endif
 
-static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
-  return xd->up_available ? xd->mi[-xd->mi_stride] : NULL;
-}
+static INLINE int get_segment_id(const VP9_COMMON *cm,
+                                 const uint8_t *segment_ids,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  int x, y, segment_id = MAX_SEGMENTS;
 
-static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
-  return xd->left_available ? xd->mi[-1] : NULL;
-}
+  for (y = 0; y < ymis; ++y)
+    for (x = 0; x < xmis; ++x)
+      segment_id = MIN(segment_id,
+                       segment_ids[mi_offset + y * cm->mi_cols + x]);
 
-int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE bsize, int mi_row, int mi_col);
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
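get_segment_id(), now inlined in the header, returns the minimum segment id over the 8x8 mode-info units covered by the block, clamping the scan so it never reads past the right or bottom edge of the mode-info grid. A standalone sketch of that min-over-rectangle scan; the 255 sentinel stands in for MAX_SEGMENTS and the grid parameters are illustrative:

  #include <stdint.h>

  static int min_segment_id(const uint8_t *ids, int grid_cols, int grid_rows,
                            int row, int col, int bw, int bh) {
    const int xmis = (grid_cols - col < bw) ? grid_cols - col : bw;  /* clamp */
    const int ymis = (grid_rows - row < bh) ? grid_rows - row : bh;
    int x, y, seg = 255;            /* larger than any valid segment id */
    for (y = 0; y < ymis; y++)
      for (x = 0; x < xmis; x++) {
        const int v = ids[(row + y) * grid_cols + (col + x)];
        if (v < seg)
          seg = v;                  /* keep the smallest id in the block */
      }
    return seg;
  }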
 
 static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
-  const MODE_INFO *const above_mi = get_above_mi(xd);
-  const MODE_INFO *const left_mi = get_left_mi(xd);
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
   const int above_sip = (above_mi != NULL) ?
                         above_mi->mbmi.seg_id_predicted : 0;
   const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
@@ -39,66 +47,87 @@
   return above_sip + left_sip;
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
+static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
                                                 const MACROBLOCKD *xd) {
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
 
 static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
-  const MODE_INFO *const above_mi = get_above_mi(xd);
-  const MODE_INFO *const left_mi = get_left_mi(xd);
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
   const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
   const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
   return above_skip + left_skip;
 }
 
-static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm,
                                          const MACROBLOCKD *xd) {
-  return cm->fc.skip_probs[vp9_get_skip_context(xd)];
+  return cm->fc->skip_probs[vp9_get_skip_context(xd)];
 }
 
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
 int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd) {
-  return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)];
+  return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
 }
 
 int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
                                                    const MACROBLOCKD *xd) {
-  return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
+  return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
 }
 
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                     const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
-  return cm->fc.comp_ref_prob[pred_context];
+  return cm->fc->comp_ref_prob[pred_context];
 }
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
-  return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
-  return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
 }
 
-int vp9_get_tx_size_context(const MACROBLOCKD *xd);
+// Returns a context number for the given MB prediction signal
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
+                                                   : max_tx_size;
+  int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
+                                                : max_tx_size;
+  if (!has_left)
+    left_ctx = above_ctx;
 
-static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+  if (!has_above)
+    above_ctx = left_ctx;
+
+  return (above_ctx + left_ctx) > max_tx_size;
+}
+
+static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
                                            const struct tx_probs *tx_probs) {
   switch (max_tx_size) {
     case TX_8X8:
@@ -113,10 +142,10 @@
   }
 }
 
-static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size,
+static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
                                             const MACROBLOCKD *xd,
                                             const struct tx_probs *tx_probs) {
-  return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs);
+  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
 }
 
 static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
diff --git a/libvpx/vp9/common/vp9_prob.h b/libvpx/vp9/common/vp9_prob.h
deleted file mode 100644
index fa0e36d..0000000
--- a/libvpx/vp9/common/vp9_prob.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_PROB_H_
-#define VP9_COMMON_VP9_PROB_H_
-
-#include "./vpx_config.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef uint8_t vp9_prob;
-
-#define MAX_PROB 255
-
-#define vp9_prob_half ((vp9_prob) 128)
-
-typedef int8_t vp9_tree_index;
-
-#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
-
-#define vp9_complement(x) (255 - x)
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const vp9_tree_index vp9_tree[];
-
-static INLINE vp9_prob clip_prob(int p) {
-  return (p > 255) ? 255 : (p < 1) ? 1 : p;
-}
-
-static INLINE vp9_prob get_prob(int num, int den) {
-  return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
-}
-
-static INLINE vp9_prob get_binary_prob(int n0, int n1) {
-  return get_prob(n0, n0 + n1);
-}
-
-/* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
-  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
-}
-
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
-                                   const unsigned int ct[2],
-                                   unsigned int count_sat,
-                                   unsigned int max_update_factor) {
-  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
-  const unsigned int count = MIN(ct[0] + ct[1], count_sat);
-  const unsigned int factor = max_update_factor * count / count_sat;
-  return weighted_prob(pre_prob, prob, factor);
-}
-
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, unsigned int count_sat,
-                          unsigned int max_update_factor, vp9_prob *probs);
-
-
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_COMMON_VP9_PROB_H_
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index 3332e58..d83f3c1 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -47,6 +47,78 @@
   1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    10,    13,    15,    17,    20,    22,
+  25,    28,    31,    34,    37,    40,    43,    47,
+  50,    53,    57,    60,    64,    68,    71,    75,
+  78,    82,    86,    90,    93,    97,   101,   105,
+  109,   113,   116,   120,   124,   128,   132,   136,
+  140,   143,   147,   151,   155,   159,   163,   166,
+  170,   174,   178,   182,   185,   189,   193,   197,
+  200,   204,   208,   212,   215,   219,   223,   226,
+  230,   233,   237,   241,   244,   248,   251,   255,
+  259,   262,   266,   269,   273,   276,   280,   283,
+  287,   290,   293,   297,   300,   304,   307,   310,
+  314,   317,   321,   324,   327,   331,   334,   337,
+  343,   350,   356,   362,   369,   375,   381,   387,
+  394,   400,   406,   412,   418,   424,   430,   436,
+  442,   448,   454,   460,   466,   472,   478,   484,
+  490,   499,   507,   516,   525,   533,   542,   550,
+  559,   567,   576,   584,   592,   601,   609,   617,
+  625,   634,   644,   655,   666,   676,   687,   698,
+  708,   718,   729,   739,   749,   759,   770,   782,
+  795,   807,   819,   831,   844,   856,   868,   880,
+  891,   906,   920,   933,   947,   961,   975,   988,
+  1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+  1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+  1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+  1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+  1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+  1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+  1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+  2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+  2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+  3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+  4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+};
+
+static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+  4,    12,    18,    25,    33,    41,    50,    60,
+  70,    80,    91,   103,   115,   127,   140,   153,
+  166,   180,   194,   208,   222,   237,   251,   266,
+  281,   296,   312,   327,   343,   358,   374,   390,
+  405,   421,   437,   453,   469,   484,   500,   516,
+  532,   548,   564,   580,   596,   611,   627,   643,
+  659,   674,   690,   706,   721,   737,   752,   768,
+  783,   798,   814,   829,   844,   859,   874,   889,
+  904,   919,   934,   949,   964,   978,   993,  1008,
+  1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+  1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+  1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+  1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+  1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+  2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+  2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+  3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+  3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+  4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+  5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+  5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+  6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+  7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+  8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+  10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+  12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+  16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+};
+#endif
+
 static const int16_t ac_qlookup[QINDEX_RANGE] = {
   4,       8,    9,   10,   11,   12,   13,   14,
   15,     16,   17,   18,   19,   20,   21,   22,
@@ -82,19 +154,120 @@
   1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };
 
-int16_t vp9_dc_quant(int qindex, int delta) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    11,    13,    16,    18,    21,    24,
+  27,    30,    33,    37,    40,    44,    48,    51,
+  55,    59,    63,    67,    71,    75,    79,    83,
+  88,    92,    96,   100,   105,   109,   114,   118,
+  122,   127,   131,   136,   140,   145,   149,   154,
+  158,   163,   168,   172,   177,   181,   186,   190,
+  195,   199,   204,   208,   213,   217,   222,   226,
+  231,   235,   240,   244,   249,   253,   258,   262,
+  267,   271,   275,   280,   284,   289,   293,   297,
+  302,   306,   311,   315,   319,   324,   328,   332,
+  337,   341,   345,   349,   354,   358,   362,   367,
+  371,   375,   379,   384,   388,   392,   396,   401,
+  409,   417,   425,   433,   441,   449,   458,   466,
+  474,   482,   490,   498,   506,   514,   523,   531,
+  539,   547,   555,   563,   571,   579,   588,   596,
+  604,   616,   628,   640,   652,   664,   676,   688,
+  700,   713,   725,   737,   749,   761,   773,   785,
+  797,   809,   825,   841,   857,   873,   889,   905,
+  922,   938,   954,   970,   986,  1002,  1018,  1038,
+  1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+  1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+  1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+  1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+  1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+  2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+  2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+  2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+  3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+  4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+  4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+  5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+  6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+};
+
+static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+  4,    13,    19,    27,    35,    44,    54,    64,
+  75,    87,    99,   112,   126,   139,   154,   168,
+  183,   199,   214,   230,   247,   263,   280,   297,
+  314,   331,   349,   366,   384,   402,   420,   438,
+  456,   475,   493,   511,   530,   548,   567,   586,
+  604,   623,   642,   660,   679,   698,   716,   735,
+  753,   772,   791,   809,   828,   846,   865,   884,
+  902,   920,   939,   957,   976,   994,  1012,  1030,
+  1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+  1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+  1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+  1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+  1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+  2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+  2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+  3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+  4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+  4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+  6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+  7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+  8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+  10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+  11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+  13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+  16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+  18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+  21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+  25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+};
+#endif
+
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_10:
+      return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_12:
+      return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
 }
 
-int16_t vp9_ac_quant(int qindex, int delta) {
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_10:
+      return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)];
+    case VPX_BITS_12:
+      return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
   return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+#endif
 }
 
-
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
-  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
     const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
         data : base_qindex + data;
     return clamp(seg_qindex, 0, MAXQ);
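With this change, vp9_dc_quant() and vp9_ac_quant() take a vpx_bit_depth_t and pick one of three lookup tables (8-, 10- or 12-bit) after clamping qindex + delta to [0, MAXQ]; vp9_get_qindex() may first replace or offset the base qindex per segment. A hedged sketch of the resolution path, using tiny stand-in tables and plain ints rather than the real QINDEX_RANGE tables above:

  #include <assert.h>
  #include <stdint.h>

  static int clampi(int v, int lo, int hi) {
    return v < lo ? lo : (v > hi ? hi : v);
  }

  static int16_t dc_quant(int qindex, int delta, int bit_depth) {
    static const int16_t dc_q8[4]  = { 4, 8, 12, 16 };   /* stand-in tables */
    static const int16_t dc_q10[4] = { 4, 9, 13, 17 };
    static const int16_t dc_q12[4] = { 4, 12, 18, 25 };
    const int q = clampi(qindex + delta, 0, 3);           /* MAXQ stand-in */
    switch (bit_depth) {
      case 8:  return dc_q8[q];
      case 10: return dc_q10[q];
      case 12: return dc_q12[q];
      default: assert(0 && "unsupported bit depth"); return -1;
    }
  }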
diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h
index d1545d9..4bae4a8 100644
--- a/libvpx/vp9/common/vp9_quant_common.h
+++ b/libvpx/vp9/common/vp9_quant_common.h
@@ -11,7 +11,8 @@
 #ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
-#include "vp9/common/vp9_blockd.h"
+#include "vpx/vpx_codec.h"
+#include "vp9/common/vp9_seg_common.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -22,8 +23,8 @@
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8
 
-int16_t vp9_dc_quant(int qindex, int delta);
-int16_t vp9_ac_quant(int qindex, int delta);
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
 
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 86ae648..f83f825 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -16,66 +16,45 @@
 #include "vpx/vpx_integer.h"
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-static void build_mc_border(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            int x, int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      memset(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right)
-      memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-static void inter_predictor(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            const int subpel_x,
-                            const int subpel_y,
-                            const struct scale_factors *sf,
-                            int w, int h, int ref,
-                            const InterpKernel *kernel,
-                            int xs, int ys) {
-  sf->predict[subpel_x != 0][subpel_y != 0][ref](
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_inter_predictor(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int subpel_x,
+                                 const int subpel_y,
+                                 const struct scale_factors *sf,
+                                 int w, int h, int ref,
+                                 const InterpKernel *kernel,
+                                 int xs, int ys, int bd) {
+  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
 }
 
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *src_mv,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int ref,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd) {
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+                     is_q4 ? src_mv->col : src_mv->col * 2 };
+  MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+  high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                       sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
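As the hunk above shows, vp9_highbd_build_inter_predictor() first brings the motion vector to q4 (1/16-pel) precision, doubling it when it is stored at 1/8-pel, then splits it into a whole-pixel part that offsets the source pointer and a fractional part that selects the interpolation filter phase via SUBPEL_MASK. A minimal sketch of that split (SUBPEL_BITS is 4 in VP9; the example offsets are arbitrary):

  #include <stdio.h>

  #define SUBPEL_BITS 4
  #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

  int main(void) {
    const int mv_row_q4 = 37, mv_col_q4 = 21;       /* example 1/16-pel MVs */
    const int int_row = mv_row_q4 >> SUBPEL_BITS;   /* whole-pixel part */
    const int frac_row = mv_row_q4 & SUBPEL_MASK;   /* filter phase */
    const int int_col = mv_col_q4 >> SUBPEL_BITS;
    const int frac_col = mv_col_q4 & SUBPEL_MASK;

    /* The source pointer is advanced by the whole-pixel part ... */
    printf("advance src by (%d, %d) pixels\n", int_row, int_col);
    /* ... and kernel[frac] picks the interpolation filter phase. */
    printf("filter phases: row %d, col %d\n", frac_row, frac_col);
    return 0;
  }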
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *src_mv,
@@ -151,8 +130,8 @@
   return clamped_mv;
 }
 
-static MV average_split_mvs(const struct macroblockd_plane *pd,
-                            const MODE_INFO *mi, int ref, int block) {
+MV average_split_mvs(const struct macroblockd_plane *pd,
+                     const MODE_INFO *mi, int ref, int block) {
   const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
   MV res = {0, 0};
   switch (ss_idx) {
@@ -169,7 +148,7 @@
       res = mi_mv_pred_q4(mi, ref);
       break;
     default:
-      assert(ss_idx <= 3 || ss_idx >= 0);
+      assert(ss_idx <= 3 && ss_idx >= 0);
   }
   return res;
 }
@@ -181,7 +160,7 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MODE_INFO *mi = xd->mi[0];
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
   int ref;
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -205,8 +184,9 @@
     uint8_t *pre;
     MV32 scaled_mv;
     int xs, ys, subpel_x, subpel_y;
+    const int is_scaled = vp9_is_scaled(sf);
 
-    if (vp9_is_scaled(sf)) {
+    if (is_scaled) {
       pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
@@ -222,8 +202,19 @@
     pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
            + (scaled_mv.col >> SUBPEL_BITS);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                           xd->bd);
+    } else {
+      inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                      subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+    }
+#else
     inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
                     subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 }
 
@@ -259,187 +250,31 @@
                                     BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
 }
+
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
 }
+
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
                                     MAX_MB_PLANE - 1);
 }
 
-// TODO(jingning): This function serves as a placeholder for decoder prediction
-// using on demand border extension. It should be moved to /decoder/ directory.
-static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                       int bw, int bh,
-                                       int x, int y, int w, int h,
-                                       int mi_x, int mi_y) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MODE_INFO *mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
-  int ref;
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-    struct buf_2d *const pre_buf = &pd->pre[ref];
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? average_split_mvs(pd, mi, ref, block)
-               : mi->mbmi.mv[ref].as_mv;
-
-
-    // TODO(jkoleszar): This clamping is done in the incorrect place for the
-    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
-    // MV. Note however that it performs the subsampling aware scaling so
-    // that the result is always q4.
-    // mv_precision precision is MV_PRECISION_Q4.
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                               pd->subsampling_x,
-                                               pd->subsampling_y);
-
-    MV32 scaled_mv;
-    int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
-        subpel_x, subpel_y;
-    uint8_t *ref_frame, *buf_ptr;
-    const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
-
-    // Get reference frame pointer, width and height.
-    if (plane == 0) {
-      frame_width = ref_buf->y_crop_width;
-      frame_height = ref_buf->y_crop_height;
-      ref_frame = ref_buf->y_buffer;
-    } else {
-      frame_width = ref_buf->uv_crop_width;
-      frame_height = ref_buf->uv_crop_height;
-      ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
-    }
-
-    if (vp9_is_scaled(sf)) {
-      // Co-ordinate of containing block to pixel precision.
-      int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
-      int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
-      // Co-ordinate of the block to 1/16th pixel precision.
-      x0_16 = (x_start + x) << SUBPEL_BITS;
-      y0_16 = (y_start + y) << SUBPEL_BITS;
-
-      // Co-ordinate of current block in reference frame
-      // to 1/16th pixel precision.
-      x0_16 = sf->scale_value_x(x0_16, sf);
-      y0_16 = sf->scale_value_y(y0_16, sf);
-
-      // Map the top left corner of the block into the reference frame.
-      x0 = sf->scale_value_x(x_start + x, sf);
-      y0 = sf->scale_value_y(y_start + y, sf);
-
-      // Scale the MV and incorporate the sub-pixel offset of the block
-      // in the reference frame.
-      scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-      xs = sf->x_step_q4;
-      ys = sf->y_step_q4;
-    } else {
-      // Co-ordinate of containing block to pixel precision.
-      x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-      y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-      // Co-ordinate of the block to 1/16th pixel precision.
-      x0_16 = x0 << SUBPEL_BITS;
-      y0_16 = y0 << SUBPEL_BITS;
-
-      scaled_mv.row = mv_q4.row;
-      scaled_mv.col = mv_q4.col;
-      xs = ys = 16;
-    }
-    subpel_x = scaled_mv.col & SUBPEL_MASK;
-    subpel_y = scaled_mv.row & SUBPEL_MASK;
-
-    // Calculate the top left corner of the best matching block in the reference frame.
-    x0 += scaled_mv.col >> SUBPEL_BITS;
-    y0 += scaled_mv.row >> SUBPEL_BITS;
-    x0_16 += scaled_mv.col;
-    y0_16 += scaled_mv.row;
-
-    // Get reference block pointer.
-    buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
-    buf_stride = pre_buf->stride;
-
-    // Do border extension if there is motion or the
-    // width/height is not a multiple of 8 pixels.
-    if (scaled_mv.col || scaled_mv.row ||
-        (frame_width & 0x7) || (frame_height & 0x7)) {
-      // Get reference block bottom right coordinate.
-      int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-      int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-      int x_pad = 0, y_pad = 0;
-
-      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
-        x0 -= VP9_INTERP_EXTEND - 1;
-        x1 += VP9_INTERP_EXTEND;
-        x_pad = 1;
-      }
-
-      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
-        y0 -= VP9_INTERP_EXTEND - 1;
-        y1 += VP9_INTERP_EXTEND;
-        y_pad = 1;
-      }
-
-      // Skip border extension if block is inside the frame.
-      if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
-          y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
-        uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
-        // Extend the border.
-        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
-                        x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
-                        frame_height);
-        buf_stride = x1 - x0 + 1;
-        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-      }
-    }
-
-    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-}
-
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
-                                                        &xd->plane[plane]);
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-    const int bw = 4 * num_4x4_w;
-    const int bh = 4 * num_4x4_h;
-
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
-      int i = 0, x, y;
-      assert(bsize == BLOCK_8X8);
-      for (y = 0; y < num_4x4_h; ++y)
-        for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh,
-                                     4 * x, 4 * y, 4, 4, mi_x, mi_y);
-    } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh,
-                                 0, 0, bw, bh, mi_x, mi_y);
-    }
-  }
-}
-
 void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
                           const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col) {
-  uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                               src->alpha_buffer};
-  const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                          src->alpha_stride};
+  uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+      src->v_buffer};
+  const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+      src->uv_stride};
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -455,11 +290,10 @@
                           const struct scale_factors *sf) {
   if (src != NULL) {
     int i;
-    uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                                 src->alpha_buffer};
-    const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                            src->alpha_stride};
-
+    uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+        src->v_buffer};
+    const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+        src->uv_stride};
     for (i = 0; i < MAX_MB_PLANE; ++i) {
       struct macroblockd_plane *const pd = &xd->plane[i];
       setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 58c596e..7d90774 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -11,25 +11,57 @@
 #ifndef VP9_COMMON_VP9_RECONINTER_H_
 #define VP9_COMMON_VP9_RECONINTER_H_
 
-#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   const int subpel_x,
+                                   const int subpel_y,
+                                   const struct scale_factors *sf,
+                                   int w, int h, int ref,
+                                   const InterpKernel *kernel,
+                                   int xs, int ys) {
+  sf->predict[subpel_x != 0][subpel_y != 0][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_inter_predictor(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int subpel_x,
+                                 const int subpel_y,
+                                 const struct scale_factors *sf,
+                                 int w, int h, int ref,
+                                 const InterpKernel *kernel,
+                                 int xs, int ys, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
+                     int ref, int block);
+
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y);
+
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize);
 
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane);
+
 void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize);
 
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
 
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
@@ -39,6 +71,17 @@
                                enum mv_precision precision,
                                int x, int y);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *mv_q3,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int do_avg,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd);
+#endif
+
 static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                        const struct scale_factors *sf) {
   const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset;
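
For reference, a minimal sketch of how the new inter_predictor() wrapper is
typically driven (illustrative only, not part of the patch): the caller splits
a 1/16-pel motion vector into whole-pel and sub-pel parts and lets the
scale_factors predict table pick the matching convolve variant. Only
inter_predictor() and its parameters come from the header above; the helper
name and the unscaled step values (16) are assumptions.

#include "vp9/common/vp9_reconinter.h"

static void predict_block_sketch(const uint8_t *pre_buf, int pre_stride,
                                 uint8_t *dst, int dst_stride,
                                 const MV *mv,  /* already scaled to 1/16 pel */
                                 const struct scale_factors *sf,
                                 const InterpKernel *kernel,
                                 int w, int h, int ref) {
  /* Split the motion vector into integer and sub-pel components. */
  const int subpel_x = mv->col & SUBPEL_MASK;
  const int subpel_y = mv->row & SUBPEL_MASK;
  const uint8_t *src = pre_buf + (mv->row >> SUBPEL_BITS) * pre_stride
                               + (mv->col >> SUBPEL_BITS);
  /* ref != 0 selects the averaging variant used for compound prediction;
   * xs == ys == 16 means the reference frame is not scaled. */
  inter_predictor(src, pre_stride, dst, dst_stride, subpel_x, subpel_y,
                  sf, w, h, ref, kernel, 16, 16);
}
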
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index 403e105..e60eff8 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -9,12 +9,15 @@
  */
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_once.h"
 
-#include "./vp9_rtcd.h"
-
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
@@ -31,319 +34,112 @@
   ADST_ADST,  // TM
 };
 
-// This serves as a wrapper function, so that all the prediction functions
-// can be unified and accessed as a pointer array. Note that the boundary
-// above and left are not necessarily used all the time.
-#define intra_pred_sized(type, size) \
-  void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
-                                                  ptrdiff_t stride, \
-                                                  const uint8_t *above, \
-                                                  const uint8_t *left) { \
-    type##_predictor(dst, stride, size, above, left); \
-  }
+enum {
+  NEED_LEFT = 1 << 1,
+  NEED_ABOVE = 1 << 2,
+  NEED_ABOVERIGHT = 1 << 3,
+};
 
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 4) \
-  intra_pred_sized(type, 8) \
-  intra_pred_sized(type, 16) \
-  intra_pred_sized(type, 32)
-
-static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void) above;
-  // first column
-  for (r = 0; r < bs - 1; ++r)
-    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
-  dst[(bs - 1) * stride] = left[bs - 1];
-  dst++;
-
-  // second column
-  for (r = 0; r < bs - 2; ++r)
-    dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 +
-                                         left[r + 2], 2);
-  dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] +
-                                              left[bs - 1] * 3, 2);
-  dst[(bs - 1) * stride] = left[bs - 1];
-  dst++;
-
-  // rest of last row
-  for (c = 0; c < bs - 2; ++c)
-    dst[(bs - 1) * stride + c] = left[bs - 1];
-
-  for (r = bs - 2; r >= 0; --r)
-    for (c = 0; c < bs - 2; ++c)
-      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
-}
-intra_pred_allsizes(d207)
-
-static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                 const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void) left;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
-                                          above[r/2 + c + 1] * 2 +
-                                          above[r/2 + c + 2], 2)
-                     : ROUND_POWER_OF_TWO(above[r/2 + c] +
-                                          above[r/2 + c + 1], 1);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(d63)
-
-static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                 const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void) left;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dst[c] = r + c + 2 < bs * 2 ?  ROUND_POWER_OF_TWO(above[r + c] +
-                                                        above[r + c + 1] * 2 +
-                                                        above[r + c + 2], 2)
-                                  : above[bs * 2 - 1];
-    dst += stride;
-  }
-}
-intra_pred_allsizes(d45)
-
-static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-
-  // first row
-  for (c = 0; c < bs; c++)
-    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1);
-  dst += stride;
-
-  // second row
-  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
-  for (c = 1; c < bs; c++)
-    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
-  dst += stride;
-
-  // the rest of first col
-  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
-  for (r = 3; r < bs; ++r)
-    dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 +
-                                               left[r - 1], 2);
-
-  // the rest of the block
-  for (r = 2; r < bs; ++r) {
-    for (c = 1; c < bs; c++)
-      dst[c] = dst[-2 * stride + c - 1];
-    dst += stride;
-  }
-}
-intra_pred_allsizes(d117)
-
-static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
-  for (c = 1; c < bs; c++)
-    dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2);
-
-  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
-  for (r = 2; r < bs; ++r)
-    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
-                                         left[r], 2);
-
-  dst += stride;
-  for (r = 1; r < bs; ++r) {
-    for (c = 1; c < bs; c++)
-      dst[c] = dst[-stride + c - 1];
-    dst += stride;
-  }
-}
-intra_pred_allsizes(d135)
-
-static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1);
-  for (r = 1; r < bs; r++)
-    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1);
-  dst++;
-
-  dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2);
-  dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2);
-  for (r = 2; r < bs; r++)
-    dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 +
-                                         left[r], 2);
-  dst++;
-
-  for (c = 0; c < bs - 2; c++)
-    dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2);
-  dst += stride;
-
-  for (r = 1; r < bs; ++r) {
-    for (c = 0; c < bs - 2; c++)
-      dst[c] = dst[-stride + c - 2];
-    dst += stride;
-  }
-}
-intra_pred_allsizes(d153)
-
-static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void) left;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memcpy(dst, above, bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(v)
-
-static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void) above;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memset(dst, left[r], bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(h)
-
-static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  int ytop_left = above[-1];
-
-  for (r = 0; r < bs; r++) {
-    for (c = 0; c < bs; c++)
-      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(tm)
-
-static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                    const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void) above;
-  (void) left;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memset(dst, 128, bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(dc_128)
-
-static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  (void) above;
-
-  for (i = 0; i < bs; i++)
-    sum += left[i];
-  expected_dc = (sum + (bs >> 1)) / bs;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memset(dst, expected_dc, bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(dc_left)
-
-static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                    const uint8_t *above, const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  (void) left;
-
-  for (i = 0; i < bs; i++)
-    sum += above[i];
-  expected_dc = (sum + (bs >> 1)) / bs;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memset(dst, expected_dc, bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(dc_top)
-
-static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                const uint8_t *above, const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  const int count = 2 * bs;
-
-  for (i = 0; i < bs; i++) {
-    sum += above[i];
-    sum += left[i];
-  }
-
-  expected_dc = (sum + (count >> 1)) / count;
-
-  for (r = 0; r < bs; r++) {
-    vpx_memset(dst, expected_dc, bs);
-    dst += stride;
-  }
-}
-intra_pred_allsizes(dc)
-#undef intra_pred_allsizes
+static const uint8_t extend_modes[INTRA_MODES] = {
+  NEED_ABOVE | NEED_LEFT,       // DC
+  NEED_ABOVE,                   // V
+  NEED_LEFT,                    // H
+  NEED_ABOVERIGHT,              // D45
+  NEED_LEFT | NEED_ABOVE,       // D135
+  NEED_LEFT | NEED_ABOVE,       // D117
+  NEED_LEFT | NEED_ABOVE,       // D153
+  NEED_LEFT,                    // D207
+  NEED_ABOVERIGHT,              // D63
+  NEED_LEFT | NEED_ABOVE,       // TM
+};
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
 
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
-  l[0] = vp9_##type##_predictor_4x4; \
-  l[1] = vp9_##type##_predictor_8x8; \
-  l[2] = vp9_##type##_predictor_16x16; \
-  l[3] = vp9_##type##_predictor_32x32
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][4];
+static intra_high_pred_fn dc_pred_high[2][2][4];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  intra_pred_allsizes(pred[V_PRED], v);
-  intra_pred_allsizes(pred[H_PRED], h);
-  intra_pred_allsizes(pred[D207_PRED], d207);
-  intra_pred_allsizes(pred[D45_PRED], d45);
-  intra_pred_allsizes(pred[D63_PRED], d63);
-  intra_pred_allsizes(pred[D117_PRED], d117);
-  intra_pred_allsizes(pred[D135_PRED], d135);
-  intra_pred_allsizes(pred[D153_PRED], d153);
-  intra_pred_allsizes(pred[TM_PRED], tm);
+static void vp9_init_intra_predictors_internal(void) {
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vpx_##type##_predictor_4x4; \
+  p[TX_8X8] = vpx_##type##_predictor_8x8; \
+  p[TX_16X16] = vpx_##type##_predictor_16x16; \
+  p[TX_32X32] = vpx_##type##_predictor_32x32
 
-  intra_pred_allsizes(dc_pred[0][0], dc_128);
-  intra_pred_allsizes(dc_pred[0][1], dc_top);
-  intra_pred_allsizes(dc_pred[1][0], dc_left);
-  intra_pred_allsizes(dc_pred[1][1], dc);
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
+
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
+  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
+  INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
+  INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
+  INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+  INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
+
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#undef intra_pred_allsizes
+#undef INIT_ALL_SIZES
 }
 
-static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
-                                   int ref_stride, uint8_t *dst, int dst_stride,
-                                   PREDICTION_MODE mode, TX_SIZE tx_size,
-                                   int up_available, int left_available,
-                                   int right_available, int x, int y,
-                                   int plane) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static void build_intra_predictors_high(const MACROBLOCKD *xd,
+                                        const uint8_t *ref8,
+                                        int ref_stride,
+                                        uint8_t *dst8,
+                                        int dst_stride,
+                                        PREDICTION_MODE mode,
+                                        TX_SIZE tx_size,
+                                        int up_available,
+                                        int left_available,
+                                        int right_available,
+                                        int x, int y,
+                                        int plane, int bd) {
   int i;
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
-  uint8_t *above_row = above_data + 16;
-  const uint8_t *const_above_row = above_row;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  DECLARE_ALIGNED(16, uint16_t, left_col[32]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
+  uint16_t *above_row = above_data + 16;
+  const uint16_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
   int frame_width, frame_height;
   int x0, y0;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-
+  // Neutral prediction value: 128 at 8 bits, scaled up for higher bit depths.
+  int base = 128 << (bd - 8);
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
-  // ..
-
-  once(init_intra_pred_fn_ptrs);
 
   // Get current frame pointer, width and height.
   if (plane == 0) {
@@ -358,8 +154,6 @@
   x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
   y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
 
-  vpx_memset(left_col, 129, 64);
-
   // left
   if (left_available) {
     if (xd->mb_to_bottom_edge < 0) {
@@ -379,60 +173,207 @@
       for (i = 0; i < bs; ++i)
         left_col[i] = ref[i * ref_stride - 1];
     }
+  } else {
+    // TODO(Peter): this value should probably change for high bitdepth
+    vpx_memset16(left_col, base + 1, bs);
   }
 
   // TODO(hkuang) do not extend 2*bs pixels for all modes.
   // above
   if (up_available) {
-    const uint8_t *above_ref = ref - ref_stride;
+    const uint16_t *above_ref = ref - ref_stride;
     if (xd->mb_to_right_edge < 0) {
       /* slower path if the block needs border extension */
       if (x0 + 2 * bs <= frame_width) {
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row, above_ref, 2 * bs);
+          memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
         } else {
-          vpx_memcpy(above_row, above_ref, bs);
-          vpx_memset(above_row + bs, above_row[bs - 1], bs);
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 + bs <= frame_width) {
         const int r = frame_width - x0;
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row, above_ref, r);
-          vpx_memset(above_row + r, above_row[r - 1],
-                     x0 + 2 * bs - frame_width);
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
         } else {
-          vpx_memcpy(above_row, above_ref, bs);
-          vpx_memset(above_row + bs, above_row[bs - 1], bs);
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 <= frame_width) {
         const int r = frame_width - x0;
-        if (right_available && bs == 4) {
-          vpx_memcpy(above_row, above_ref, r);
-          vpx_memset(above_row + r, above_row[r - 1],
-                     x0 + 2 * bs - frame_width);
-        } else {
-          vpx_memcpy(above_row, above_ref, r);
-          vpx_memset(above_row + r, above_row[r - 1],
-                     x0 + 2 * bs - frame_width);
-        }
+        memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+        vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
       }
-      above_row[-1] = left_available ? above_ref[-1] : 129;
+      // TODO(Peter): this value should probably change for high bitdepth
+      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
     } else {
       /* faster path if the block does not need extension */
       if (bs == 4 && right_available && left_available) {
         const_above_row = above_ref;
       } else {
-        vpx_memcpy(above_row, above_ref, bs);
+        memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
         if (bs == 4 && right_available)
-          vpx_memcpy(above_row + bs, above_ref + bs, bs);
+          memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
         else
-          vpx_memset(above_row + bs, above_row[bs - 1], bs);
-        above_row[-1] = left_available ? above_ref[-1] : 129;
+          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+        // TODO(Peter): this value should probably change for high bitdepth
+        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
       }
     }
   } else {
-    vpx_memset(above_row, 127, bs * 2);
-    above_row[-1] = 127;
+    vpx_memset16(above_row, base - 1, bs * 2);
+    // TODO(Peter): this value should probably change for high bitdepth
+    above_row[-1] = base - 1;
+  }
+
+  // predict
+  if (mode == DC_PRED) {
+    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
+                                                        const_above_row,
+                                                        left_col, xd->bd);
+  } else {
+    pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
+                             xd->bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+                                   int ref_stride, uint8_t *dst, int dst_stride,
+                                   PREDICTION_MODE mode, TX_SIZE tx_size,
+                                   int up_available, int left_available,
+                                   int right_available, int x, int y,
+                                   int plane) {
+  int i;
+  DECLARE_ALIGNED(16, uint8_t, left_col[32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
+  uint8_t *above_row = above_data + 16;
+  const uint8_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // ..
+
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  // NEED_LEFT
+  if (extend_modes[mode] & NEED_LEFT) {
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+    } else {
+      memset(left_col, 129, bs);
+    }
+  }
+
+  // NEED_ABOVE
+  if (extend_modes[mode] & NEED_ABOVE) {
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs);
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs);
+      above_row[-1] = 127;
+    }
+  }
+
+  // NEED_ABOVERIGHT
+  if (extend_modes[mode] & NEED_ABOVERIGHT) {
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r);
+            memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs);
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs);
+          else
+            memset(above_row + bs, above_row[bs - 1], bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs * 2);
+      above_row[-1] = 127;
+    }
   }
 
   // predict
@@ -444,20 +385,31 @@
   }
 }
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
                              int aoff, int loff, int plane) {
-  const int bwl = bwl_in - tx_size;
-  const int wmask = (1 << bwl) - 1;
-  const int have_top = (block_idx >> bwl) || xd->up_available;
-  const int have_left = (block_idx & wmask) || xd->left_available;
-  const int have_right = ((block_idx & wmask) != wmask);
+  const int bw = (1 << bwl_in);
+  const int txw = (1 << tx_size);
+  const int have_top = loff || xd->up_available;
+  const int have_left = aoff || xd->left_available;
+  const int have_right = (aoff + txw) < bw;
   const int x = aoff * 4;
   const int y = loff * 4;
 
-  assert(bwl >= 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
+                                tx_size, have_top, have_left, have_right,
+                                x, y, plane, xd->bd);
+    return;
+  }
+#endif
   build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
                          have_top, have_left, have_right, x, y, plane);
 }
+
+void vp9_init_intra_predictors(void) {
+  once(vp9_init_intra_predictors_internal);
+}
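
Since vp9_predict_intra_block() no longer takes a block index, neighbour
availability is now derived from the (aoff, loff) offsets and bwl_in, and
callers must make sure the predictor tables have been initialised. A
hypothetical caller is sketched below; it assumes the reconstructed pixels are
used in place as the reference (ref == dst), as the decoder does, and every
name other than the two vp9_ functions is illustrative.

#include "vp9/common/vp9_reconintra.h"

/* Predict one TX_4X4 luma block at transform-block offset (aoff, loff)
 * inside an 8x8 partition, i.e. bwl_in == 1 (two 4x4 columns wide). */
static void predict_intra_4x4_sketch(const MACROBLOCKD *xd,
                                     uint8_t *dst, int dst_stride,
                                     PREDICTION_MODE mode,
                                     int aoff, int loff) {
  vp9_init_intra_predictors();  /* idempotent; guarded internally by once() */
  vp9_predict_intra_block(xd, 1 /* bwl_in */, TX_4X4, mode,
                          dst, dst_stride,  /* ref: reconstructed pixels */
                          dst, dst_stride,  /* dst */
                          aoff, loff, 0 /* plane */);
}
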
diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h
index d09d2a1..de45380 100644
--- a/libvpx/vp9/common/vp9_reconintra.h
+++ b/libvpx/vp9/common/vp9_reconintra.h
@@ -18,7 +18,9 @@
 extern "C" {
 #endif
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_init_intra_predictors(void);
+
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c
index dc15a84..2dfa09f 100644
--- a/libvpx/vp9/common/vp9_rtcd.c
+++ b/libvpx/vp9/common/vp9_rtcd.c
@@ -12,9 +12,8 @@
 #include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vpx_scale_rtcd(void);
-
 void vp9_rtcd() {
-    vpx_scale_rtcd();
+    // TODO(JBB): Remove this function once both the encoder and decoder
+    // setup functions are protected by once().
     once(setup_rtcd_internal);
 }
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index 708f41b..737fc56 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -5,6 +5,7 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
 
 struct macroblockd;
@@ -20,7 +21,12 @@
 }
 forward_decls qw/vp9_common_forward_decls/;
 
-# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
+# x86inc.asm had specific constraints. Break it out so it's easy to disable.
+# Zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
 if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
   $mmx_x86inc = 'mmx';
   $sse_x86inc = 'sse';
@@ -28,247 +34,32 @@
   $ssse3_x86inc = 'ssse3';
   $avx_x86inc = 'avx';
   $avx2_x86inc = 'avx2';
-} else {
-  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
-  $avx_x86inc = $avx2_x86inc = '';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
 }
 
-# this variable is for functions that are 64 bit only.
+# Functions that are 64-bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
   $mmx_x86_64 = 'mmx';
   $sse2_x86_64 = 'sse2';
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
-} else {
-  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
-  $avx_x86_64 = $avx2_x86_64 = '';
 }
 
 #
-# RECON
-#
-add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
-
-add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d117_predictor_4x4/;
-
-add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_4x4/;
-
-add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
-
-add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
-$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
-
-add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
-
-add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/;
-
-add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/;
-
-add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/;
-
-add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
-
-add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d117_predictor_8x8/;
-
-add_proto qw/void vp9_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_8x8/;
-
-add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
-
-add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
-$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
-
-add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
-
-add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/;
-
-add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/;
-
-add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/;
-
-add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
-
-add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d117_predictor_16x16/;
-
-add_proto qw/void vp9_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_16x16/;
-
-add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
-
-add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
-
-add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
-
-add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/;
-
-add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16/;
-
-add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/;
-
-add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_32x32/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
-
-add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
-$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
-
-add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d117_predictor_32x32/;
-
-add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_32x32/;
-
-add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_32x32/;
-
-add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
-
-add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
-$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
-
-add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
-
-add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32/;
-
-add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32/;
-
-add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32/;
-
-#
-# Loopfilter
-#
-add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
-
-add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
-
-add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/;
-$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon;
-
-add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon;
-
-add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
-$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
-
-add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/;
-$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon;
-
-add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
-
-#
 # post proc
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down mmx sse2/;
+specialize qw/vp9_mbpost_proc_down sse2/;
 $vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
 
 add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
@@ -276,506 +67,227 @@
 $vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
 
 add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise mmx sse2/;
+specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
 }
 
-add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_inner/;
-
-add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_outer/;
-
-add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_b/;
-
-#
-# Sub Pixel Filters
-#
-add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
-
-add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
-
-add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_neon_asm=vp9_convolve8_neon;
-
-add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
-
-add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
-
-add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon;
-
-add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon;
-
-add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
-
 #
 # dct
 #
-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a
+  # check to ensure that when CONFIG_EMULATE_HARDWARE is on, it defaults to
+  # the C versions only.
+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht4x4_16_add/;
 
-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht8x8_64_add/;
 
-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/vp9_iht16x16_256_add/;
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add/;
 
-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add/;
 
-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add/;
+  } else {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
 
-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
 
-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
+  }
+}
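
For orientation, a rough sketch (not the literal generated file) of how these
entries are consumed: each add_proto/specialize pair is turned by the rtcd
scripts into a prototype per variant plus a dispatch pointer in vp9_rtcd.h,
initialised once by setup_rtcd_internal(). The excerpt below assumes an x86
target and uses the RTCD_EXTERN/HAS_SSE2/x86_simd_caps() names that libvpx's
generated headers normally rely on; treat it as illustrative.

/* Roughly what the generated vp9_rtcd.h contains for one entry. */
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride, int tx_type);
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, int tx_type);
RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest,
                                      int dest_stride, int tx_type);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();
  vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;              /* portable default */
  if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
}
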
 
-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Sub Pixel Filters
+  #
+  add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_copy/;
 
-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+  add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_avg/;
 
-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+  add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64";
 
-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+  add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64";
 
-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+  add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64";
 
-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+  add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64";
 
-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+  add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
 
-# dct and add
+  add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
 
-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_1_add/;
+  #
+  # post proc
+  #
+  if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vp9_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_down/;
 
-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_16_add/;
+    add_proto qw/void vp9_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_across_ip/;
+
+    add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_post_proc_down_and_across/;
+
+    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vp9_highbd_plane_add_noise/;
+  }
+
+  #
+  # dct
+  #
+  # Note: as optimized versions of these functions are added, we need to add a
+  # check to ensure that when CONFIG_EMULATE_HARDWARE is on, it defaults to
+  # the C versions only.
+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht4x4_16_add/;
+
+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht8x8_64_add/;
+
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht16x16_256_add/;
+}
 
 #
 # Encoder functions below this point.
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
+add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
+specialize qw/vp9_avg_8x8 sse2 neon msa/;
 
-# variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2 msa/;
 
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
+add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp9_minmax_8x8 sse2/;
 
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
 
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
 
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
 
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
+add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+specialize qw/vp9_int_pro_row sse2 neon/;
 
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
+add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
+specialize qw/vp9_int_pro_col sse2 neon/;
 
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var neon sse2/;
 
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
-add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad64x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x8/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
-
-add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x3/;
-
-add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x3/;
-
-add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x3 sse3/;
-
-add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x3 sse3/;
-
-add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x3 sse3/;
-
-add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad64x64x8/;
-
-add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad32x32x8/;
-
-add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x16x8 sse4/;
-
-add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x16x8 sse4/;
-
-add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x4x8/;
-
-add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x8x8/;
-
-add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x4x8 sse4/;
-
-add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x64x4d sse2/;
-
-add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x4d sse2/;
-
-add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x4d sse2/;
-
-add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x4d sse2/;
-
-# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
-add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x4x4d sse2/;
-
-add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x8x4d sse/;
-
-add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x4d sse/;
-
-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/;
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/;
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_8x8/;
+  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_4x4/;
+  add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vp9_highbd_minmax_8x8/;
+}
 
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss mmx sse2/;
 # ENCODEMB INVOKE
 
-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
-
-add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
-
-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
-
-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
-
-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
-
 #
-# Structured Similarity (SSIM)
+# Denoiser
 #
-if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vp9_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp9_ssim_parms_8x8/, "$sse2_x86_64";
+if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
+  add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+  specialize qw/vp9_denoiser_filter sse2/;
+}
 
-    add_proto qw/void vp9_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp9_ssim_parms_16x16/, "$sse2_x86_64";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# The transform coefficients are held in 32-bit values, so the assembler
+# code for vp9_block_error can no longer be used.
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error/;
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp/;
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/;
+
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant/;
+} else {
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp neon/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
 
 # fdct functions
-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2/;
 
-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2/;
 
-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2/;
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2/;
 
-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2/;
 
-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4_1 sse2/;
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+} else {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2 msa/;
 
-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2/;
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2 msa/;
 
-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2 neon/;
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2 msa/;
 
-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
-
-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16_1 sse2/;
-
-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2/;
-
-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_1 sse2/;
-
-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32 sse2 avx2/;
-
-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+}
 
 #
 # Motion search
@@ -785,9 +297,6 @@
 $vp9_full_search_sad_sse3=vp9_full_search_sadx3;
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
-add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_refining_search_sad/;
-
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_diamond_search_sad/;
 
@@ -795,7 +304,39 @@
 specialize qw/vp9_full_range_search/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2/;
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+  # ENCODEMB INVOKE
+
+  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_highbd_block_error sse2/;
+
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp/;
+
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp_32x32/;
+
+  # fdct functions
+  add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht4x4/;
+
+  add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8/;
+
+  add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16/;
+
+  add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fwht4x4/;
+
+  add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_highbd_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
 
 }
 # end encoder functions
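For context: each add_proto line in the vp9_rtcd_defs.pl changes above declares one function
prototype for libvpx's run-time CPU detection (RTCD) layer, and the matching specialize line
lists which SIMD flavours (sse2, neon, msa, ...) may replace the plain C version. The sketch
below is only a rough illustration of that dispatch idea, with made-up names; it is not the
generated vp9_rtcd.h and not the real vp9_avg_8x8 kernels.

   /*
    * Illustrative sketch of an RTCD-style dispatch: a function pointer starts
    * at the plain C kernel and would be rebound to an SSE2/NEON/MSA kernel
    * when CPU detection (or the build-time config) allows it.
    */
   #include <stdio.h>

   typedef unsigned int (*avg_8x8_fn)(const unsigned char *src, int stride);

   /* C fallback: rounded average of an 8x8 block of pixels. */
   static unsigned int avg_8x8_c(const unsigned char *src, int stride) {
     unsigned int sum = 0;
     int r, c;
     for (r = 0; r < 8; ++r)
       for (c = 0; c < 8; ++c)
         sum += src[r * stride + c];
     return (sum + 32) >> 6;
   }

   /* What "specialize" amounts to: pick the best kernel once, then every
    * caller goes through the pointer. */
   static avg_8x8_fn avg_8x8 = avg_8x8_c;

   int main(void) {
     unsigned char block[64];
     int i;
     for (i = 0; i < 64; ++i) block[i] = (unsigned char)i;  /* values 0..63 */
     printf("avg_8x8 = %u\n", avg_8x8(block, 8));           /* prints 32 */
     return 0;
   }
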
diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c
index 2f58323..b763b92 100644
--- a/libvpx/vp9/common/vp9_scale.c
+++ b/libvpx/vp9/common/vp9_scale.c
@@ -8,9 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_scale.h"
+#include "vpx_dsp/vpx_filter.h"
 
 static INLINE int scaled_x(int val, const struct scale_factors *sf) {
   return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
@@ -43,9 +44,16 @@
   return res;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h,
+                                       int use_highbd) {
+#else
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h) {
+#endif
   if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     sf->x_scale_fp = REF_INVALID_SCALE;
     sf->y_scale_fp = REF_INVALID_SCALE;
@@ -71,44 +79,97 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
+
   if (sf->x_step_q4 == 16) {
     if (sf->y_step_q4 == 16) {
       // No scaling in either direction.
-      sf->predict[0][0][0] = vp9_convolve_copy;
-      sf->predict[0][0][1] = vp9_convolve_avg;
-      sf->predict[0][1][0] = vp9_convolve8_vert;
-      sf->predict[0][1][1] = vp9_convolve8_avg_vert;
-      sf->predict[1][0][0] = vp9_convolve8_horiz;
-      sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+      sf->predict[0][0][0] = vpx_convolve_copy;
+      sf->predict[0][0][1] = vpx_convolve_avg;
+      sf->predict[0][1][0] = vpx_convolve8_vert;
+      sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+      sf->predict[1][0][0] = vpx_convolve8_horiz;
+      sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
     } else {
       // No scaling in x direction. Must always scale in the y direction.
-      sf->predict[0][0][0] = vp9_convolve8_vert;
-      sf->predict[0][0][1] = vp9_convolve8_avg_vert;
-      sf->predict[0][1][0] = vp9_convolve8_vert;
-      sf->predict[0][1][1] = vp9_convolve8_avg_vert;
-      sf->predict[1][0][0] = vp9_convolve8;
-      sf->predict[1][0][1] = vp9_convolve8_avg;
+      sf->predict[0][0][0] = vpx_scaled_vert;
+      sf->predict[0][0][1] = vpx_scaled_avg_vert;
+      sf->predict[0][1][0] = vpx_scaled_vert;
+      sf->predict[0][1][1] = vpx_scaled_avg_vert;
+      sf->predict[1][0][0] = vpx_scaled_2d;
+      sf->predict[1][0][1] = vpx_scaled_avg_2d;
     }
   } else {
     if (sf->y_step_q4 == 16) {
       // No scaling in the y direction. Must always scale in the x direction.
-      sf->predict[0][0][0] = vp9_convolve8_horiz;
-      sf->predict[0][0][1] = vp9_convolve8_avg_horiz;
-      sf->predict[0][1][0] = vp9_convolve8;
-      sf->predict[0][1][1] = vp9_convolve8_avg;
-      sf->predict[1][0][0] = vp9_convolve8_horiz;
-      sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+      sf->predict[0][0][0] = vpx_scaled_horiz;
+      sf->predict[0][0][1] = vpx_scaled_avg_horiz;
+      sf->predict[0][1][0] = vpx_scaled_2d;
+      sf->predict[0][1][1] = vpx_scaled_avg_2d;
+      sf->predict[1][0][0] = vpx_scaled_horiz;
+      sf->predict[1][0][1] = vpx_scaled_avg_horiz;
     } else {
       // Must always scale in both directions.
-      sf->predict[0][0][0] = vp9_convolve8;
-      sf->predict[0][0][1] = vp9_convolve8_avg;
-      sf->predict[0][1][0] = vp9_convolve8;
-      sf->predict[0][1][1] = vp9_convolve8_avg;
-      sf->predict[1][0][0] = vp9_convolve8;
-      sf->predict[1][0][1] = vp9_convolve8_avg;
+      sf->predict[0][0][0] = vpx_scaled_2d;
+      sf->predict[0][0][1] = vpx_scaled_avg_2d;
+      sf->predict[0][1][0] = vpx_scaled_2d;
+      sf->predict[0][1][1] = vpx_scaled_avg_2d;
+      sf->predict[1][0][0] = vpx_scaled_2d;
+      sf->predict[1][0][1] = vpx_scaled_avg_2d;
     }
   }
+
   // 2D subpel motion always gets filtered in both directions
-  sf->predict[1][1][0] = vp9_convolve8;
-  sf->predict[1][1][1] = vp9_convolve8_avg;
+
+  if ((sf->x_step_q4 != 16) || (sf->y_step_q4 != 16)) {
+    sf->predict[1][1][0] = vpx_scaled_2d;
+    sf->predict[1][1][1] = vpx_scaled_avg_2d;
+  } else {
+    sf->predict[1][1][0] = vpx_convolve8;
+    sf->predict[1][1][1] = vpx_convolve8_avg;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (use_highbd) {
+    if (sf->x_step_q4 == 16) {
+      if (sf->y_step_q4 == 16) {
+        // No scaling in either direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+      } else {
+        // No scaling in x direction. Must always scale in the y direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+      }
+    } else {
+      if (sf->y_step_q4 == 16) {
+        // No scaling in the y direction. Must always scale in the x direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+      } else {
+        // Must always scale in both directions.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+      }
+    }
+    // 2D subpel motion always gets filtered in both directions.
+    sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+    sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
+  }
+#endif
 }
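For context: the predict table populated above is a 2x2x2 array of convolve entry points,
and it is assumed here to be consumed by the inter predictor roughly as
predict[subpel_x != 0][subpel_y != 0][avg], so the scaled-vs-unscaled decision is made once
per frame instead of per block. The sketch below only shows that table-lookup pattern with a
deliberately simplified signature; it is not the libvpx convolve API.

   /*
    * Sketch of a [subpel_x][subpel_y][avg] dispatch table.  The real
    * convolve_fn_t also takes filter kernels, q4 step sizes and block
    * dimensions; this simplified signature only shows the selection pattern.
    */
   #include <stdio.h>
   #include <stddef.h>

   typedef void (*predict_fn)(const unsigned char *src, ptrdiff_t src_stride,
                              unsigned char *dst, ptrdiff_t dst_stride,
                              int w, int h);

   static void copy_block(const unsigned char *src, ptrdiff_t src_stride,
                          unsigned char *dst, ptrdiff_t dst_stride,
                          int w, int h) {
     int r, c;
     for (r = 0; r < h; ++r)
       for (c = 0; c < w; ++c)
         dst[r * dst_stride + c] = src[r * src_stride + c];
   }

   static void filter_block(const unsigned char *src, ptrdiff_t src_stride,
                            unsigned char *dst, ptrdiff_t dst_stride,
                            int w, int h) {
     /* Stand-in for a real sub-pel filter: average each pixel with its
      * right-hand neighbour so this entry behaves differently from a copy. */
     int r, c;
     for (r = 0; r < h; ++r)
       for (c = 0; c < w; ++c)
         dst[r * dst_stride + c] = (unsigned char)
             ((src[r * src_stride + c] + src[r * src_stride + c + 1] + 1) >> 1);
   }

   int main(void) {
     predict_fn predict[2][2][2];
     unsigned char src[8 * 9], dst[8 * 8];
     int a, b, c, i;
     int subpel_x = 1, subpel_y = 0, use_avg = 0;

     /* Frame-level setup: fill the table once (the avg dimension is ignored
      * in this sketch). */
     for (a = 0; a < 2; ++a)
       for (b = 0; b < 2; ++b)
         for (c = 0; c < 2; ++c)
           predict[a][b][c] = (a == 0 && b == 0) ? copy_block : filter_block;

     for (i = 0; i < 8 * 9; ++i) src[i] = (unsigned char)i;

     /* Block-level use: non-zero sub-pel offsets select the filtering paths. */
     predict[subpel_x != 0][subpel_y != 0][use_avg != 0](src, 9, dst, 8, 8, 8);
     printf("dst[0] = %u\n", dst[0]);  /* (0 + 1 + 1) >> 1 = 1 */
     return 0;
   }
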
diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h
index ad6f5d7..5e91041 100644
--- a/libvpx/vp9/common/vp9_scale.h
+++ b/libvpx/vp9/common/vp9_scale.h
@@ -12,7 +12,7 @@
 #define VP9_COMMON_VP9_SCALE_H_
 
 #include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vpx_dsp/vpx_convolve.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,13 +32,23 @@
   int (*scale_value_y)(int val, const struct scale_factors *sf);
 
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
+#endif
 };
 
 MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h,
+                                       int use_high);
+#else
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h);
+#endif
 
 static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_INVALID_SCALE &&
diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c
index 1ec5a0c..d6fb8b2 100644
--- a/libvpx/vp9/common/vp9_scan.c
+++ b/libvpx/vp9/common/vp9_scan.c
@@ -233,37 +233,467 @@
 // in {top, left, topleft, topright, bottomleft} order
 // for each position in raster scan order.
 // -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, static int16_t,
-                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
+  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
 
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static  int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]);
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
+  3, 10, 10, 7, 7, 11, 11, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
+  12, 10, 10, 13, 13, 14, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
+  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
+  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
+  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
+  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
+  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
+  47, 47, 55, 55, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
+  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
+  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
+  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
+  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
+  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
+  53, 54, 54, 61, 61, 62, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
+  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
+  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
+  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
+  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
+  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
+  46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
+  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
+  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
+  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
+  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
+  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
+  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
+  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
+  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
+  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
+  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
+  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
+  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
+  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
+  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
+  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
+  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
+  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
+  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
+  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
+  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
+  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
+  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
+  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
+  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
+  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
+  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
+  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
+  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
+  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
+  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
+  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
+  223, 239, 239, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
+  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
+  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
+  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
+  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
+  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
+  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
+  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
+  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
+  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
+  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
+  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
+  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
+  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
+  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
+  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
+  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
+  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
+  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
+  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
+  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
+  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
+  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
+  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
+  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
+  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
+  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
+  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
+  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
+  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
+  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
+  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
+  238, 253, 253, 254, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
+  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
+  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
+  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
+  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
+  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
+  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
+  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
+  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
+  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
+  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
+  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
+  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
+  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
+  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
+  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
+  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
+  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
+  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
+  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
+  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
+  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
+  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
+  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
+  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
+  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
+  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
+  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
+  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
+  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
+  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
+  238, 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
+  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
+  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
+  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
+  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
+  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
+  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
+  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
+  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
+  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
+  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
+  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
+  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
+  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
+  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
+  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
+  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
+  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
+  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
+  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
+  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
+  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
+  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
+  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
+  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
+  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
+  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
+  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
+  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
+  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
+  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
+  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
+  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
+  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
+  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
+  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
+  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
+  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
+  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
+  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
+  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
+  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
+  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
+  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
+  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
+  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
+  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
+  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
+  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
+  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
+  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
+  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
+  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
+  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
+  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
+  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
+  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
+  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
+  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
+  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
+  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
+  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
+  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
+  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
+  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
+  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
+  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
+  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
+  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
+  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
+  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
+  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
+  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
+  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
+  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
+  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
+  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
+  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
+  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
+  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
+  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
+  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
+  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
+  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
+  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
+  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
+  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
+  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
+  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
+  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
+  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
+  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
+  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
+  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
+  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
+  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
+  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
+  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
+  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
+  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
+  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
+  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
+  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
+  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
+  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
+  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
+  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
+  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
+  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
+  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
+  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
+  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+};
+
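+// Inverse scan tables: entry rc holds the position, in scan order, of the
+// coefficient at raster index rc, so iscan[scan[n]] == n for every n.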
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
+  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
+  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
+  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
+  0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
+  2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
+  6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
+  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
+  0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
+  6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
+  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
+  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
+  0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
+  3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
+  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
+  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
+  0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
+  1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
+  2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
+  3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
+  5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
+  7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
+  9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
+  13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
+  17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
+  22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+  27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+  33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+  42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+  50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+  57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
+  0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86,
+  3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, 115, 130,
+  8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, 119, 142, 167,
+  14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, 116, 135, 161, 185,
+  21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, 112, 133, 154, 179, 205,
+  28, 34, 39, 45, 50, 58, 67, 77, 87, 96, 106, 121, 146, 169, 196, 212,
+  41, 46, 49, 56, 63, 70, 79, 90, 98, 107, 122, 138, 159, 182, 207, 222,
+  52, 57, 62, 69, 75, 83, 93, 102, 110, 120, 134, 150, 176, 195, 215, 226,
+  66, 71, 78, 82, 91, 97, 108, 113, 127, 136, 148, 168, 188, 202, 221, 232,
+  80, 89, 92, 101, 105, 114, 125, 131, 139, 151, 162, 177, 192, 208, 223, 234,
+  95, 104, 109, 117, 123, 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239,
+  111, 118, 124, 129, 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240,
+  243, 126, 132, 137, 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237,
+  244, 246, 141, 149, 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238,
+  242, 249, 251, 152, 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236,
+  245, 247, 252, 253, 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235,
+  241, 248, 250, 254, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
+  0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179,
+  1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, 178, 196,
+  3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, 164, 186, 201,
+  6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, 153, 169, 193, 208,
+  10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, 133, 161, 176, 198, 214,
+  15, 21, 26, 34, 43, 52, 65, 77, 91, 106, 120, 140, 165, 185, 205, 221,
+  22, 27, 32, 41, 48, 60, 73, 85, 99, 116, 130, 151, 175, 190, 211, 225,
+  29, 35, 42, 49, 59, 69, 81, 95, 108, 125, 139, 155, 182, 197, 217, 229,
+  38, 45, 51, 61, 68, 80, 93, 105, 118, 134, 150, 168, 191, 207, 223, 234,
+  50, 56, 63, 74, 83, 94, 109, 117, 129, 147, 163, 177, 199, 213, 228, 238,
+  62, 70, 76, 87, 97, 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242,
+  75, 82, 90, 102, 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245,
+  89, 100, 111, 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250,
+  103, 115, 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248,
+  252, 121, 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244,
+  251, 254, 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247,
+  249, 253, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
+  0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
+  210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
+  527, 1, 4, 8, 15, 22, 30, 45, 58, 74, 92, 112, 133, 158, 184, 203, 215, 222,
+  228, 234, 237, 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551,
+  3, 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, 208, 217, 224,
+  231, 235, 238, 273, 297, 316, 329, 375, 403, 425, 440, 493, 525, 550, 567,
+  6, 11, 16, 23, 31, 43, 60, 73, 90, 109, 126, 150, 173, 196, 211, 220, 226,
+  232, 236, 239, 296, 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575,
+  9, 14, 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, 223, 244,
+  255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, 582, 596, 617, 645,
+  13, 20, 26, 35, 44, 54, 72, 85, 105, 123, 140, 163, 182, 205, 216, 225,
+  254, 271, 294, 314, 353, 373, 400, 423, 468, 491, 522, 548, 595, 616, 644,
+  666, 21, 27, 33, 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
+  270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, 643, 665,
+  680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, 159, 178, 197, 212, 221, 230,
+  292, 312, 326, 334, 398, 421, 437, 446, 520, 546, 564, 574, 642, 664, 679,
+  687, 34, 40, 46, 56, 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291,
+  340, 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, 723,
+  747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, 194, 252, 268, 290,
+  311, 351, 370, 396, 420, 466, 488, 518, 545, 593, 613, 640, 663, 704, 722,
+  746, 765, 51, 59, 66, 76, 89, 99, 119, 131, 149, 168, 181, 200, 267, 289,
+  310, 325, 369, 395, 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721,
+  745, 764, 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, 288,
+  309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, 661, 677, 686,
+  744, 763, 776, 783, 70, 79, 86, 97, 108, 122, 137, 155, 242, 251, 266, 287,
+  339, 350, 368, 393, 452, 465, 486, 515, 580, 592, 611, 637, 692, 703, 720,
+  743, 788, 798, 813, 833, 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286,
+  308, 349, 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
+  742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, 185, 264,
+  285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, 609, 635, 659, 676,
+  718, 741, 761, 775, 811, 831, 847, 858, 117, 128, 136, 148, 160, 175, 188,
+  198, 284, 306, 322, 332, 390, 415, 433, 444, 512, 540, 560, 572, 634, 658,
+  675, 685, 740, 760, 774, 782, 830, 846, 857, 863, 135, 146, 152, 165, 241,
+  249, 263, 283, 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633,
+  691, 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, 174,
+  183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, 539, 589, 607,
+  632, 657, 700, 716, 738, 759, 795, 809, 828, 845, 874, 886, 902, 915, 176,
+  187, 195, 202, 261, 281, 304, 321, 363, 387, 413, 432, 481, 509, 538, 559,
+  606, 631, 656, 674, 715, 737, 758, 773, 808, 827, 844, 856, 885, 901, 914,
+  923, 192, 199, 206, 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537,
+  558, 571, 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
+  913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, 480, 507,
+  578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, 825, 866, 873, 884,
+  899, 930, 936, 945, 957, 246, 259, 278, 302, 345, 361, 384, 411, 460, 479,
+  506, 536, 587, 604, 628, 654, 698, 713, 734, 756, 793, 806, 824, 842, 872,
+  883, 898, 912, 935, 944, 956, 966, 258, 277, 301, 319, 360, 383, 410, 430,
+  478, 505, 535, 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841,
+  854, 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, 409,
+  429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, 770, 780, 822,
+  840, 853, 861, 896, 910, 920, 926, 954, 964, 971, 975, 336, 344, 359, 381,
+  449, 459, 477, 503, 577, 586, 602, 625, 689, 697, 711, 731, 785, 792, 804,
+  821, 865, 871, 881, 895, 929, 934, 942, 953, 977, 981, 987, 995, 343, 358,
+  380, 408, 458, 476, 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791,
+  803, 820, 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
+  357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, 729, 752,
+  769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, 962, 970, 985, 993,
+  1000, 1005, 378, 406, 427, 441, 500, 531, 554, 569, 622, 649, 669, 682, 728,
+  751, 768, 779, 818, 837, 851, 860, 892, 907, 918, 925, 950, 961, 969, 974,
+  992, 999, 1004, 1007, 448, 457, 474, 499, 576, 584, 599, 621, 688, 695, 708,
+  727, 784, 790, 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979,
+  984, 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
+  694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, 938, 948,
+  960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, 529, 553, 597,
+  619, 647, 668, 706, 725, 749, 767, 799, 815, 835, 850, 876, 889, 905, 917,
+  937, 947, 959, 968, 982, 989, 997, 1003, 1011, 1015, 1019, 1022, 496, 528,
+  552, 568, 618, 646, 667, 681, 724, 748, 766, 778, 814, 834, 849, 859, 888,
+  904, 916, 924, 946, 958, 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021,
+  1023,
+};
 
 const scan_order vp9_default_scan_orders[TX_SIZES] = {
   {default_scan_4x4,   vp9_default_iscan_4x4,   default_scan_4x4_neighbors},
@@ -295,93 +725,3 @@
     {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
   }
 };
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
-  int n, l2 = l * l;
-  for (n = 0; n < l2; n++) {
-    int rc = scan[n];
-    if (rc == idx)
-      return  n;
-  }
-  assert(0);
-  return -1;
-}
-
-static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l,
-                                int16_t *neighbors) {
-  int l2 = l * l;
-  int n, i, j;
-
-  // dc doesn't use this type of prediction
-  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
-  iscan[0] = find_in_scan(scan, l, 0);
-  for (n = 1; n < l2; n++) {
-    int rc = scan[n];
-    iscan[n] = find_in_scan(scan, l, n);
-    i = rc / l;
-    j = rc % l;
-    if (i > 0 && j > 0) {
-      // col/row scan is used for adst/dct, and generally means that
-      // energy decreases to zero much faster in the dimension in
-      // which ADST is used compared to the direction in which DCT
-      // is used. Likewise, we find much higher correlation between
-      // coefficients within the direction in which DCT is used.
-      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
-      // as a context. If ADST or DCT is used in both directions, we
-      // use the combination of the two as a context.
-      int a = (i - 1) * l + j;
-      int b =  i      * l + j - 1;
-      if (scan == col_scan_4x4 || scan == col_scan_8x8 ||
-          scan == col_scan_16x16) {
-        // in the col/row scan cases (as well as left/top edge cases), we set
-        // both contexts to the same value, so we can branchlessly do a+b+1>>1
-        // which automatically becomes a if a == b
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = a;
-      } else if (scan == row_scan_4x4 || scan == row_scan_8x8 ||
-                 scan == row_scan_16x16) {
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      } else {
-        neighbors[MAX_NEIGHBORS * n + 0] = a;
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      }
-    } else if (i > 0) {
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
-    } else {
-      assert(j > 0);
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] =  i      * l + j - 1;
-    }
-    assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
-  }
-  // one padding item so we don't have to add branches in code to handle
-  // calls to get_coef_context() for the token after the final dc token
-  neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
-
-void vp9_init_neighbors() {
-  init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4,
-                      default_scan_4x4_neighbors);
-  init_scan_neighbors(row_scan_4x4, vp9_row_iscan_4x4, 4,
-                      row_scan_4x4_neighbors);
-  init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4,
-                      col_scan_4x4_neighbors);
-  init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8,
-                      default_scan_8x8_neighbors);
-  init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8,
-                      row_scan_8x8_neighbors);
-  init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8,
-                      col_scan_8x8_neighbors);
-  init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16,
-                      default_scan_16x16_neighbors);
-  init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16,
-                      row_scan_16x16_neighbors);
-  init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16,
-                      col_scan_16x16_neighbors);
-  init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32,
-                      default_scan_32x32_neighbors);
-}
diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h
index 9613b67..1d86b5c 100644
--- a/libvpx/vp9/common/vp9_scan.h
+++ b/libvpx/vp9/common/vp9_scan.h
@@ -23,8 +23,6 @@
 
 #define MAX_NEIGHBORS 2
 
-void vp9_init_neighbors();
-
 typedef struct {
   const int16_t *scan;
   const int16_t *iscan;
@@ -40,6 +38,18 @@
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
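+// Pick the scan order for a block: inter blocks, chroma planes and lossless
+// blocks use the default scan for the transform size, while intra luma
+// blocks choose a row, column or default scan based on the transform type
+// implied by the intra prediction mode.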
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                         PLANE_TYPE type, int block_idx) {
+  const MODE_INFO *const mi = xd->mi[0];
+
+  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+    return &vp9_default_scan_orders[tx_size];
+  } else {
+    const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
+    return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c
index 910200e..c8ef618 100644
--- a/libvpx/vp9/common/vp9_seg_common.c
+++ b/libvpx/vp9/common/vp9_seg_common.c
@@ -25,12 +25,6 @@
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const struct segmentation *seg, int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  return seg->enabled &&
-         (seg->feature_mask[segment_id] & (1 << feature_id));
-}
-
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
@@ -60,13 +54,7 @@
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const struct segmentation *seg, int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return seg->feature_data[segment_id][feature_id];
-}
-
-
-const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
 };
diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h
index ff2d66a..5b75d8d 100644
--- a/libvpx/vp9/common/vp9_seg_common.h
+++ b/libvpx/vp9/common/vp9_seg_common.h
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_SEG_COMMON_H_
 #define VP9_COMMON_VP9_SEG_COMMON_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -42,16 +42,19 @@
   uint8_t abs_delta;
   uint8_t temporal_update;
 
-  vp9_prob tree_probs[SEG_TREE_PROBS];
-  vp9_prob pred_probs[PREDICTION_PROBS];
+  vpx_prob tree_probs[SEG_TREE_PROBS];
+  vpx_prob pred_probs[PREDICTION_PROBS];
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
   unsigned int feature_mask[MAX_SEGMENTS];
 };
 
-int vp9_segfeature_active(const struct segmentation *seg,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
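+// A feature is active for a segment when segmentation is enabled and the
+// feature's bit is set in that segment's feature mask.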
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  return seg->enabled &&
+         (seg->feature_mask[segment_id] & (1 << feature_id));
+}
 
 void vp9_clearall_segfeatures(struct segmentation *seg);
 
@@ -68,11 +71,12 @@
                      SEG_LVL_FEATURES feature_id,
                      int seg_data);
 
-int vp9_get_segdata(const struct segmentation *seg,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];
+}
 
-extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c
new file mode 100644
index 0000000..6b11c93
--- /dev/null
+++ b/libvpx/vp9/common/vp9_thread_common.c
@@ -0,0 +1,436 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_loopfilter.h"
+
+#if CONFIG_MULTITHREAD
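+// Spin on pthread_mutex_trylock() up to kMaxTryLocks times before falling
+// back to a blocking pthread_mutex_lock(), avoiding a sleep/wake round trip
+// when the lock is released quickly.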
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+  const int kMaxTryLocks = 4000;
+  int locked = 0;
+  int i;
+
+  for (i = 0; i < kMaxTryLocks; ++i) {
+    if (!pthread_mutex_trylock(mutex)) {
+      locked = 1;
+      break;
+    }
+  }
+
+  if (!locked)
+    pthread_mutex_lock(mutex);
+}
+#endif  // CONFIG_MULTITHREAD
+
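+// Top-down dependency: before filtering superblock (r, c), wait until row
+// r - 1 has filtered at least sync_range columns beyond column c. The check
+// is only performed every sync_range columns, matching how progress is
+// signalled in sync_write().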
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
+    mutex_lock(mutex);
+
+    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
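+// Publish this row's progress so the row below may proceed. To limit
+// signalling overhead, the condition variable is only signalled every
+// sync_range columns and once at the end of the row.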
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+                              const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+  int cur;
+  // Only signal when enough superblocks have been filtered for the next row
+  // to run.
+  int sig = 1;
+
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync)
+      sig = 0;
+  } else {
+    cur = sb_cols + nsync;
+  }
+
+  if (sig) {
+    mutex_lock(&lf_sync->mutex_[r]);
+
+    lf_sync->cur_sb_col[r] = cur;
+
+    pthread_cond_signal(&lf_sync->cond_[r]);
+    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Implement row loopfiltering for each thread.
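+// Workers take interleaved superblock rows: each worker starts at its own
+// row and strides forward by num_workers superblock rows.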
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+                             VP9_COMMON *const cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only,
+                             VP9LfSync *const lf_sync) {
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  int mi_row, mi_col;
+  enum lf_path path;
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+
+  for (mi_row = start; mi_row < stop;
+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      sync_read(lf_sync, r, c);
+
+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     &lfm);
+
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+            break;
+        }
+      }
+
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+}
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
+  return 1;
+}
+
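+// Launch the loopfilter workers (no more than the number of tile columns),
+// give each an interleaved set of superblock rows, and wait for all of them
+// to finish.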
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+                                VP9_COMMON *cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VPxWorker *workers, int nworkers,
+                                VP9LfSync *lf_sync) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  // Number of superblock rows.
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // The decoder may allocate more threads than there are tile columns, based
+  // on the user's input.
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = MIN(nworkers, tile_cols);
+  int i;
+
+  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+      num_workers > lf_sync->num_workers) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  // Set up loopfilter thread data.
+  // The decoder caps num_workers because it has been observed that using more
+  // threads on the loopfilter than there are cores hurts performance on
+  // Android: the system only schedules the tile decode workers on a number of
+  // cores equal to the number of tile columns, so extra loopfilter threads
+  // only cause contention. If the multithreading code changes in the future,
+  // the number of workers used by the loopfilter should be revisited.
+  for (i = 0; i < num_workers; ++i) {
+    VPxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data
+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+}
+
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP9_COMMON *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+  if (!frame_filter_level) return;
+
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
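+  // For a partial-frame filter (e.g. during filter level search), only a
+  // superblock-row-aligned band starting at the middle of the frame and
+  // covering roughly 1/8 of its height is filtered.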
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+                      y_only, workers, num_workers, lf_sync);
+}
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives best performance.
+  if (width < 640)
+    return 1;
+  else if (width <= 1280)
+    return 2;
+  else if (width <= 4096)
+    return 4;
+  else
+    return 8;
+}
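+// For example, a 1920-pixel-wide frame uses a sync_range of 4, so a row is
+// filtered at superblock column c only once the row above has reached at
+// least column c + 4.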
+
+// Allocate memory for lf row synchronization
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+                           int width, int num_workers) {
+  lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+    if (lf_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, lf_sync->cond_,
+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+    if (lf_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&lf_sync->cond_[i], NULL);
+      }
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+  lf_sync->num_workers = num_workers;
+
+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+  // Set up nsync.
+  lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
+  if (lf_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (lf_sync->mutex_ != NULL) {
+      for (i = 0; i < lf_sync->rows; ++i) {
+        pthread_mutex_destroy(&lf_sync->mutex_[i]);
+      }
+      vpx_free(lf_sync->mutex_);
+    }
+    if (lf_sync->cond_ != NULL) {
+      for (i = 0; i < lf_sync->rows; ++i) {
+        pthread_cond_destroy(&lf_sync->cond_[i]);
+      }
+      vpx_free(lf_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    vpx_free(lf_sync->lfdata);
+    vpx_free(lf_sync->cur_sb_col);
+    // Clear the structure, as the source of this call may be a resize, in
+    // which case this call will be followed by an _alloc() that may fail.
+    vp9_zero(*lf_sync);
+  }
+}
+
+// Accumulate frame counts.
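+// Adds the counts gathered by a worker into the frame-level counts in cm.
+// The decoder (is_dec) also accumulates the coefficient counts; the encoder
+// skips them here because cm->counts.coef is only updated at frame level.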
+void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
+                                 int is_dec) {
+  int i, j, k, l, m;
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    for (j = 0; j < PARTITION_TYPES; j++)
+      cm->counts.partition[i][j] += counts->partition[i][j];
+
+  if (is_dec) {
+    int n;
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++) {
+              cm->counts.eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+              for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                cm->counts.coef[i][j][k][l][m][n] +=
+                    counts->coef[i][j][k][l][m][n];
+            }
+  } else {
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++)
+              cm->counts.eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+                // In the encoder, cm->counts.coef is only updated at frame
+                // level, so there is no need to accumulate it here.
+                // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                //   cm->counts.coef[i][j][k][l][m][n] +=
+                //       counts->coef[i][j][k][l][m][n];
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (j = 0; j < SWITCHABLE_FILTERS; j++)
+      cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    for (j = 0; j < INTER_MODES; j++)
+      cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      for (k = 0; k < 2; k++)
+        cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZES; j++)
+      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+    for (j = 0; j < TX_SIZES - 1; j++)
+      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+    for (j = 0; j < TX_SIZES - 2; j++)
+      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+  }
+
+  for (i = 0; i < TX_SIZES; i++)
+    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+
+  for (i = 0; i < SKIP_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.skip[i][j] += counts->skip[i][j];
+
+  for (i = 0; i < MV_JOINTS; i++)
+    cm->counts.mv.joints[i] += counts->mv.joints[i];
+
+  for (k = 0; k < 2; k++) {
+    nmv_component_counts *comps = &cm->counts.mv.comps[k];
+    nmv_component_counts *comps_t = &counts->mv.comps[k];
+
+    for (i = 0; i < 2; i++) {
+      comps->sign[i] += comps_t->sign[i];
+      comps->class0_hp[i] += comps_t->class0_hp[i];
+      comps->hp[i] += comps_t->hp[i];
+    }
+
+    for (i = 0; i < MV_CLASSES; i++)
+      comps->classes[i] += comps_t->classes[i];
+
+    for (i = 0; i < CLASS0_SIZE; i++) {
+      comps->class0[i] += comps_t->class0[i];
+      for (j = 0; j < MV_FP_SIZE; j++)
+        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+    }
+
+    for (i = 0; i < MV_OFFSET_BITS; i++)
+      for (j = 0; j < 2; j++)
+        comps->bits[i][j] += comps_t->bits[i][j];
+
+    for (i = 0; i < MV_FP_SIZE; i++)
+      comps->fp[i] += comps_t->fp[i];
+  }
+}
diff --git a/libvpx/vp9/common/vp9_thread_common.h b/libvpx/vp9/common/vp9_thread_common.h
new file mode 100644
index 0000000..07af1bc
--- /dev/null
+++ b/libvpx/vp9/common/vp9_thread_common.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_thread.h"
+
+struct VP9Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  // Per-row index of the most recently loop-filtered superblock column.
+  int *cur_sb_col;
+  // The optimal sync_range for different resolutions and platforms should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;
+  int num_workers;
+} VP9LfSync;
+
+// Allocate memory for loopfilter row synchronization.
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
+                           int width, int num_workers);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              struct VP9Common *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync);
+
+void vp9_accumulate_frame_counts(struct VP9Common *cm,
+                                 struct FRAME_COUNTS *counts, int is_dec);
+
+#endif  // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c
index 8c4a303..7a20e0a 100644
--- a/libvpx/vp9/common/vp9_tile_common.c
+++ b/libvpx/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@
   vp9_tile_set_col(tile, cm, col);
 }
 
+static int get_min_log2_tile_cols(const int sb64_cols) {
+  int min_log2 = 0;
+  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
+    ++min_log2;
+  return min_log2;
+}
+
+static int get_max_log2_tile_cols(const int sb64_cols) {
+  int max_log2 = 1;
+  while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+    ++max_log2;
+  return max_log2 - 1;
+}
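+
+// Example (assuming MIN_TILE_WIDTH_B64 == 4 and MAX_TILE_WIDTH_B64 == 64):
+// a 1080p frame spans 30 64x64 superblock columns, so the minimum log2 tile
+// column count is 0 (a single tile column is narrow enough) and the maximum
+// is 2, i.e. up to four tile columns of at least four superblocks each.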
+
 void vp9_get_tile_n_bits(int mi_cols,
                          int *min_log2_tile_cols, int *max_log2_tile_cols) {
-  const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
-  int min_log2 = 0, max_log2 = 0;
-
-  // max
-  while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
-    ++max_log2;
-  --max_log2;
-  if (max_log2 < 0)
-    max_log2 = 0;
-
-  // min
-  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
-    ++min_log2;
-
-  assert(min_log2 <= max_log2);
-
-  *min_log2_tile_cols = min_log2;
-  *max_log2_tile_cols = max_log2;
+  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+  assert(*min_log2_tile_cols <= *max_log2_tile_cols);
 }
diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c
deleted file mode 100644
index 1b4904c..0000000
--- a/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
-  const unsigned char *src_ptr,
-  const ptrdiff_t src_pitch,
-  unsigned char *output_ptr,
-  ptrdiff_t out_pitch,
-  unsigned int output_height,
-  const short *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                                   uint8_t *dst, ptrdiff_t dst_stride, \
-                                   const int16_t *filter_x, int x_step_q4, \
-                                   const int16_t *filter_y, int y_step_q4, \
-                                   int w, int h) { \
-  if (step_q4 == 16 && filter[3] != 128) { \
-    if (filter[0] || filter[1] || filter[2]) { \
-      while (w >= 16) { \
-        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
-                                                 src_stride, \
-                                                 dst, \
-                                                 dst_stride, \
-                                                 h, \
-                                                 filter); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } else { \
-      while (w >= 16) { \
-        vp9_filter_block1d16_##dir##2_##avg##opt(src, \
-                                                 src_stride, \
-                                                 dst, \
-                                                 dst_stride, \
-                                                 h, \
-                                                 filter); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_filter_block1d8_##dir##2_##avg##opt(src, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_filter_block1d4_##dir##2_##avg##opt(src, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } \
-  } \
-  if (w) { \
-    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
-                             filter_x, x_step_q4, filter_y, y_step_q4, \
-                             w, h); \
-  } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                              uint8_t *dst, ptrdiff_t dst_stride, \
-                              const int16_t *filter_x, int x_step_q4, \
-                              const int16_t *filter_y, int y_step_q4, \
-                              int w, int h) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
-  if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
-        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
-      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, \
-                                w, h + 7); \
-      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
-                                      filter_x, x_step_q4, filter_y, \
-                                      y_step_q4, w, h); \
-    } else { \
-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
-      vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, \
-                                w, h + 1); \
-      vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
-                                      filter_x, x_step_q4, filter_y, \
-                                      y_step_q4, w, h); \
-    } \
-  } else { \
-    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
-                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
-  } \
-}
-#if HAVE_AVX2
-filter8_1dfunction vp9_filter_block1d16_v8_avx2;
-filter8_1dfunction vp9_filter_block1d16_h8_avx2;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-FUN_CONV_2D(, avx2);
-#endif
-#if HAVE_SSSE3
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
-#endif
-
-#if HAVE_SSE2
-filter8_1dfunction vp9_filter_block1d16_v8_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_sse2;
-filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
-
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
-
-// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
-//                         int w, int h);
-// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
-//                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_ , sse2);
-#endif
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index b60f8a0..4a16345 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -8,257 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x) \
-{                                                     \
-  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-  d0 = _mm_unpacklo_epi8(d0, zero); \
-  d0 = _mm_add_epi16(in_x, d0); \
-  d0 = _mm_packus_epi16(d0, d0); \
-  *(int *)dest = _mm_cvtsi128_si32(d0); \
-  dest += stride; \
-}
-
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
-                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
-                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = _mm_load_si128((const __m128i *)input);
-  input2 = _mm_load_si128((const __m128i *)(input + 8));
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch columns 2 and 3, and then we get:
-  // input2: column 1, column 0;  input3: column 2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch columns 2 and 3, and then we get:
-  // input2: column 1, column 0;  input3: column 2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-     d0 = _mm_unpacklo_epi32(d0,
-          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
-     d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
-                    *(const int *) (dest + stride * 3)), d2);
-     d0 = _mm_unpacklo_epi8(d0, zero);
-     d2 = _mm_unpacklo_epi8(d2, zero);
-     d0 = _mm_add_epi16(d0, input2);
-     d2 = _mm_add_epi16(d2, input3);
-     d0 = _mm_packus_epi16(d0, d2);
-     // store input0
-     *(int *)dest = _mm_cvtsi128_si32(d0);
-     // store input1
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-     // store input2
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-     // store input3
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest, dc_value);
-  RECON_AND_STORE4X4(dest, dc_value);
-  RECON_AND_STORE4X4(dest, dc_value);
-  RECON_AND_STORE4X4(dest, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-static void idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-static void iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
 
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                             int tx_type) {
@@ -266,8 +18,8 @@
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0]= _mm_loadu_si128((const __m128i *)(input));
-  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
+  in[0] = _mm_loadu_si128((const __m128i *)(input));
+  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -300,568 +52,36 @@
 
   // Reconstruction and Store
   {
-     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-     d0 = _mm_unpacklo_epi32(d0,
-          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
-     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
-                    *(const int *) (dest + stride * 3)));
-     d0 = _mm_unpacklo_epi8(d0, zero);
-     d2 = _mm_unpacklo_epi8(d2, zero);
-     d0 = _mm_add_epi16(d0, in[0]);
-     d2 = _mm_add_epi16(d2, in[1]);
-     d0 = _mm_packus_epi16(d0, d2);
-     // store result[0]
-     *(int *)dest = _mm_cvtsi128_si32(d0);
-     // store result[1]
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-     // store result[2]
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-     // store result[3]
-     d0 = _mm_srli_si128(d0, 4);
-     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, in[0]);
+    d2 = _mm_add_epi16(d2, in[1]);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store result[0]
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store result[1]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store result[2]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+    // store result[3]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
   }
 }
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
-                      out0, out1, out2, out3, out4, out5, out6, out7) \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
-                                                        \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-                                                            \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
-  }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
-                         out0, out1, out2, out3) \
-  {                                              \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
-    \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                            \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-  }
-
-// Macro for multiplying elements by constants and adding the results together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
-                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
-  {   \
-      tmp0 = _mm_madd_epi16(lo_0, cst0); \
-      tmp1 = _mm_madd_epi16(hi_0, cst0); \
-      tmp2 = _mm_madd_epi16(lo_0, cst1); \
-      tmp3 = _mm_madd_epi16(hi_0, cst1); \
-      tmp4 = _mm_madd_epi16(lo_1, cst2); \
-      tmp5 = _mm_madd_epi16(hi_1, cst2); \
-      tmp6 = _mm_madd_epi16(lo_1, cst3); \
-      tmp7 = _mm_madd_epi16(hi_1, cst3); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      tmp4 = _mm_add_epi32(tmp4, rounding); \
-      tmp5 = _mm_add_epi32(tmp5, rounding); \
-      tmp6 = _mm_add_epi32(tmp6, rounding); \
-      tmp7 = _mm_add_epi32(tmp7, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-      \
-      res0 = _mm_packs_epi32(tmp0, tmp1); \
-      res1 = _mm_packs_epi32(tmp2, tmp3); \
-      res2 = _mm_packs_epi32(tmp4, tmp5); \
-      res3 = _mm_packs_epi32(tmp6, tmp7); \
-  }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {   \
-      tmp0 = _mm_madd_epi16(lo_0, cst0); \
-      tmp1 = _mm_madd_epi16(hi_0, cst0); \
-      tmp2 = _mm_madd_epi16(lo_0, cst1); \
-      tmp3 = _mm_madd_epi16(hi_0, cst1); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      \
-      res0 = _mm_packs_epi32(tmp0, tmp1); \
-      res1 = _mm_packs_epi32(tmp2, tmp3); \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
-                 out0, out1, out2, out3, out4, out5, out6, out7)  \
-  { \
-  /* Stage1 */      \
-  { \
-    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
-    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
-    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
-    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
-    \
-    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
-                          stg1_1, stg1_2, stg1_3, stp1_4,      \
-                          stp1_7, stp1_5, stp1_6)              \
-  } \
-    \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
-    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
-    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
-    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
-    \
-    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
-                           stg2_1, stg2_2, stg2_3, stp2_0,     \
-                           stp2_1, stp2_2, stp2_3)             \
-    \
-    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
-    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
-    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
-    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
-  } \
-    \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    \
-    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
-    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
-    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
-    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
-    \
-    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
-    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
-    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  } \
-  \
-  /* Stage4  */ \
-  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
-  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
-  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
-  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
-  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
-  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
-  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
-  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
-  }
-
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1<<4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
-                  in0, in1, in2, in3, in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
-             in0, in1, in2, in3, in4, in5, in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest, in0);
-  RECON_AND_STORE(dest, in1);
-  RECON_AND_STORE(dest, in2);
-  RECON_AND_STORE(dest, in3);
-  RECON_AND_STORE(dest, in4);
-  RECON_AND_STORE(dest, in5);
-  RECON_AND_STORE(dest, in6);
-  RECON_AND_STORE(dest, in7);
-}
-
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-  RECON_AND_STORE(dest, dc_value);
-}
-
-static void idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
-                in0, in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
-           in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
-}
-
-static void iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0  = in[7];
-  in1  = in[0];
-  in2  = in[5];
-  in3  = in[2];
-  in4  = in[3];
-  in5  = in[4];
-  in6  = in[1];
-  in7  = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit integers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[8];
   const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1<<4);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
   in[0] = _mm_load_si128((const __m128i *)input);
@@ -914,1418 +134,14 @@
   in[6] = _mm_srai_epi16(in[6], 5);
   in[7] = _mm_srai_epi16(in[7], 5);
 
-  RECON_AND_STORE(dest, in[0]);
-  RECON_AND_STORE(dest, in[1]);
-  RECON_AND_STORE(dest, in[2]);
-  RECON_AND_STORE(dest, in[3]);
-  RECON_AND_STORE(dest, in[4]);
-  RECON_AND_STORE(dest, in[5]);
-  RECON_AND_STORE(dest, in[6]);
-  RECON_AND_STORE(dest, in[7]);
-}
-
-void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1<<4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  { //NOLINT
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  { //NOLINT
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  { //NOLINT
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
-           in0, in1, in2, in3, in4, in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest, in0);
-  RECON_AND_STORE(dest, in1);
-  RECON_AND_STORE(dest, in2);
-  RECON_AND_STORE(dest, in3);
-  RECON_AND_STORE(dest, in4);
-  RECON_AND_STORE(dest, in5);
-  RECON_AND_STORE(dest, in6);
-  RECON_AND_STORE(dest, in7);
-}
-
-#define IDCT16 \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
-                           stg2_0, stg2_1, stg2_2, stg2_3, \
-                           stp2_8, stp2_15, stp2_9, stp2_14) \
-    \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
-                           stg2_4, stg2_5, stg2_6, stg2_7, \
-                           stp2_10, stp2_13, stp2_11, stp2_12) \
-  } \
-    \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
-                           stg3_0, stg3_1, stg3_2, stg3_3, \
-                           stp1_4, stp1_7, stp1_5, stp1_6) \
-    \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-    \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  } \
-  \
-  /* Stage4 */ \
-  { \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
-    \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
-                           stg4_0, stg4_1, stg4_2, stg4_3, \
-                           stp2_0, stp2_1, stp2_2, stp2_3) \
-    \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-    \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
-                           stg4_4, stg4_5, stg4_6, stg4_7, \
-                           stp2_9, stp2_14, stp2_10, stp2_13) \
-  } \
-    \
-  /* Stage5 */ \
-  { \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-    \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-    \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-    \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-  } \
-    \
-  /* Stage6 */ \
-  { \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-    \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-    \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                           stg6_0, stg4_0, stg6_0, stg4_0, \
-                           stp2_10, stp2_13, stp2_11, stp2_12) \
-  }
-
-#define IDCT16_10 \
-    /* Stage2 */ \
-    { \
-      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
-      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
-      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
-      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
-      \
-      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
-                             stg2_0, stg2_1, stg2_6, stg2_7, \
-                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
-    } \
-      \
-    /* Stage3 */ \
-    { \
-      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
-      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
-      \
-      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
-                               stg3_0, stg3_1,  \
-                               stp2_4, stp2_7) \
-      \
-      stp1_9  =  stp1_8_0; \
-      stp1_10 =  stp1_11;  \
-      \
-      stp1_13 = stp1_12_0; \
-      stp1_14 = stp1_15;   \
-    } \
-    \
-    /* Stage4 */ \
-    { \
-      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
-      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
-      \
-      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-      \
-      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
-                               stg4_0, stg4_1, \
-                               stp1_0, stp1_1) \
-      stp2_5 = stp2_4; \
-      stp2_6 = stp2_7; \
-      \
-      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
-                             stg4_4, stg4_5, stg4_6, stg4_7, \
-                             stp2_9, stp2_14, stp2_10, stp2_13) \
-    } \
-      \
-    /* Stage5 */ \
-    { \
-      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-      \
-      stp1_2 = stp1_1; \
-      stp1_3 = stp1_0; \
-      \
-      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-      \
-      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
-      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
-      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
-      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-      \
-      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
-      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
-      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
-      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-    } \
-      \
-    /* Stage6 */ \
-    { \
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-      \
-      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
-      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
-      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
-      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-      \
-      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                             stg6_0, stg4_0, stg6_0, stg4_0, \
-                             stp2_10, stp2_13, stp2_11, stp2_12) \
-    }
-
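The Stage5 block above spells out, in full, the pattern that the MULTIPLICATION_AND_ADD and MULTIPLICATION_AND_ADD_2 macros compress elsewhere in this macro: interleave two 8-lane rows, multiply-accumulate against packed cosine constants with _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and pack back to 16 bits. A minimal scalar sketch of one such butterfly lane, with constants and names local to the sketch rather than taken from the file:

#include <stdint.h>

/* Illustrative copies of the fixed-point parameters: transform constants
 * are scaled by 2^14 and intermediate products are rounded back down. */
#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

static int16_t sketch_round_shift(int32_t v) {
  return (int16_t)((v + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS);
}

/* One lane of the butterfly: interleaving (a, b) and multiplying by a
 * pair_set_epi16(c0, c1) constant makes _mm_madd_epi16 produce
 * a*c0 + b*c1 per 32-bit result; the add/srai pair in Stage5 is the
 * rounding shift applied here. */
static void sketch_butterfly(int16_t a, int16_t b,
                             int16_t c0a, int16_t c0b,
                             int16_t c1a, int16_t c1b,
                             int16_t *out0, int16_t *out1) {
  *out0 = sketch_round_shift((int32_t)a * c0a + (int32_t)b * c0b);
  *out1 = sketch_round_shift((int32_t)a * c1a + (int32_t)b * c1b);
}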
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-      // 1-D idct
-
-      // Load input data.
-      in[0] = _mm_load_si128((const __m128i *)input);
-      in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-      in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-      in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-      in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-      in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-      in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-      in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
-      in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
-      in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
-      in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
-      in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
-      in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
-      in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
-      in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
-      in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
-      array_transpose_8x8(in, in);
-      array_transpose_8x8(in+8, in+8);
-
-      IDCT16
-
-      // Stage7
-      curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-      curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-      curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-      curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-      curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-      curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-      curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-      curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-      curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-      curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-      curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-      curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-      curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-      curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-      curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-      curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-      curr1 = r;
-      input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-      // 1-D idct
-      array_transpose_8x8(l+i*8, in);
-      array_transpose_8x8(r+i*8, in+8);
-
-      IDCT16
-
-      // 2-D
-      in[0] = _mm_add_epi16(stp2_0, stp1_15);
-      in[1] = _mm_add_epi16(stp2_1, stp1_14);
-      in[2] = _mm_add_epi16(stp2_2, stp2_13);
-      in[3] = _mm_add_epi16(stp2_3, stp2_12);
-      in[4] = _mm_add_epi16(stp2_4, stp2_11);
-      in[5] = _mm_add_epi16(stp2_5, stp2_10);
-      in[6] = _mm_add_epi16(stp2_6, stp1_9);
-      in[7] = _mm_add_epi16(stp2_7, stp1_8);
-      in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-      in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-      in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-      in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-      in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-      in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-      in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-      in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-      // Final rounding and shift
-      in[0] = _mm_adds_epi16(in[0], final_rounding);
-      in[1] = _mm_adds_epi16(in[1], final_rounding);
-      in[2] = _mm_adds_epi16(in[2], final_rounding);
-      in[3] = _mm_adds_epi16(in[3], final_rounding);
-      in[4] = _mm_adds_epi16(in[4], final_rounding);
-      in[5] = _mm_adds_epi16(in[5], final_rounding);
-      in[6] = _mm_adds_epi16(in[6], final_rounding);
-      in[7] = _mm_adds_epi16(in[7], final_rounding);
-      in[8] = _mm_adds_epi16(in[8], final_rounding);
-      in[9] = _mm_adds_epi16(in[9], final_rounding);
-      in[10] = _mm_adds_epi16(in[10], final_rounding);
-      in[11] = _mm_adds_epi16(in[11], final_rounding);
-      in[12] = _mm_adds_epi16(in[12], final_rounding);
-      in[13] = _mm_adds_epi16(in[13], final_rounding);
-      in[14] = _mm_adds_epi16(in[14], final_rounding);
-      in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-      in[0] = _mm_srai_epi16(in[0], 6);
-      in[1] = _mm_srai_epi16(in[1], 6);
-      in[2] = _mm_srai_epi16(in[2], 6);
-      in[3] = _mm_srai_epi16(in[3], 6);
-      in[4] = _mm_srai_epi16(in[4], 6);
-      in[5] = _mm_srai_epi16(in[5], 6);
-      in[6] = _mm_srai_epi16(in[6], 6);
-      in[7] = _mm_srai_epi16(in[7], 6);
-      in[8] = _mm_srai_epi16(in[8], 6);
-      in[9] = _mm_srai_epi16(in[9], 6);
-      in[10] = _mm_srai_epi16(in[10], 6);
-      in[11] = _mm_srai_epi16(in[11], 6);
-      in[12] = _mm_srai_epi16(in[12], 6);
-      in[13] = _mm_srai_epi16(in[13], 6);
-      in[14] = _mm_srai_epi16(in[14], 6);
-      in[15] = _mm_srai_epi16(in[15], 6);
-
-      RECON_AND_STORE(dest, in[0]);
-      RECON_AND_STORE(dest, in[1]);
-      RECON_AND_STORE(dest, in[2]);
-      RECON_AND_STORE(dest, in[3]);
-      RECON_AND_STORE(dest, in[4]);
-      RECON_AND_STORE(dest, in[5]);
-      RECON_AND_STORE(dest, in[6]);
-      RECON_AND_STORE(dest, in[7]);
-      RECON_AND_STORE(dest, in[8]);
-      RECON_AND_STORE(dest, in[9]);
-      RECON_AND_STORE(dest, in[10]);
-      RECON_AND_STORE(dest, in[11]);
-      RECON_AND_STORE(dest, in[12]);
-      RECON_AND_STORE(dest, in[13]);
-      RECON_AND_STORE(dest, in[14]);
-      RECON_AND_STORE(dest, in[15]);
-
-      dest += 8 - (stride * 16);
-  }
-}
-
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 2; ++i) {
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    dest += 8 - (stride * 16);
-  }
-}
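The DC-only function above reduces the whole 16x16 inverse transform to a single value: the lone DC coefficient goes through the cospi_16_64 rounding multiply once per transform dimension, is scaled down by the final right-shift of 6, and is then added to every destination pixel with clamping. A scalar sketch of that arithmetic, with all helpers defined locally for illustration:

#include <stdint.h>

/* Illustrative constants; cospi_16_64 is round(cos(pi/4) * 2^14). */
#define SKETCH_COSPI_16_64 11585
#define SKETCH_DCT_CONST_BITS 14

static int sketch_dct_const_round_shift(int input) {
  return (input + (1 << (SKETCH_DCT_CONST_BITS - 1))) >> SKETCH_DCT_CONST_BITS;
}

static uint8_t sketch_clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar equivalent of the SSE2 DC-only path: broadcast one value and
 * add it, clamped, to the whole 16x16 destination block. */
static void sketch_idct16x16_1_add(int16_t dc, uint8_t *dest, int stride) {
  int a = sketch_dct_const_round_shift(dc * SKETCH_COSPI_16_64);
  int r, c;
  a = sketch_dct_const_round_shift(a * SKETCH_COSPI_16_64);
  a = (a + 32) >> 6;  /* ROUND_POWER_OF_TWO(a, 6) */
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      dest[r * stride + c] = sketch_clip_pixel(dest[r * stride + c] + a);
}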
-
-static void iadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
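Every stage of iadst16_8col above is built from the same eight-lane pipeline: unpack two rows into interleaved 16-bit pairs, _mm_madd_epi16 against packed constants, add the rounding bias, arithmetic-shift by DCT_CONST_BITS, and pack back to 16 bits with saturation (most stages additionally cross-add the 32-bit products before the rounding shift). A hedged SSE2 helper showing that core pipeline on its own; the names and the constant are local to the sketch:

#include <emmintrin.h>

#define SKETCH_DCT_CONST_BITS 14

/* Rotate two 8-lane rows (x, y) by the packed constant pairs k0/k1,
 * mirroring the unpack / madd / round / shift / pack sequence above. */
static void sketch_rotate_rows(__m128i x, __m128i y, __m128i k0, __m128i k1,
                               __m128i *out0, __m128i *out1) {
  const __m128i rounding = _mm_set1_epi32(1 << (SKETCH_DCT_CONST_BITS - 1));
  const __m128i lo = _mm_unpacklo_epi16(x, y);
  const __m128i hi = _mm_unpackhi_epi16(x, y);
  __m128i t0 = _mm_madd_epi16(lo, k0);
  __m128i t1 = _mm_madd_epi16(hi, k0);
  __m128i t2 = _mm_madd_epi16(lo, k1);
  __m128i t3 = _mm_madd_epi16(hi, k1);
  t0 = _mm_srai_epi32(_mm_add_epi32(t0, rounding), SKETCH_DCT_CONST_BITS);
  t1 = _mm_srai_epi32(_mm_add_epi32(t1, rounding), SKETCH_DCT_CONST_BITS);
  t2 = _mm_srai_epi32(_mm_add_epi32(t2, rounding), SKETCH_DCT_CONST_BITS);
  t3 = _mm_srai_epi32(_mm_add_epi32(t3, rounding), SKETCH_DCT_CONST_BITS);
  *out0 = _mm_packs_epi32(t0, t1);  /* saturating pack back to int16 */
  *out1 = _mm_packs_epi32(t2, t3);
}

With k0/k1 built via pair_set_epi16 as in the constant table at the top of iadst16_8col, one call corresponds to one (out0, out1) pair of a stage.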
-
-static void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8]  = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9]  = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9]  = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-static void iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
+  RECON_AND_STORE(dest + 0 * stride, in[0]);
+  RECON_AND_STORE(dest + 1 * stride, in[1]);
+  RECON_AND_STORE(dest + 2 * stride, in[2]);
+  RECON_AND_STORE(dest + 3 * stride, in[3]);
+  RECON_AND_STORE(dest + 4 * stride, in[4]);
+  RECON_AND_STORE(dest + 5 * stride, in[5]);
+  RECON_AND_STORE(dest + 6 * stride, in[6]);
+  RECON_AND_STORE(dest + 7 * stride, in[7]);
 }
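The replacement lines above address the destination with explicit n * stride offsets instead of relying on RECON_AND_STORE to advance dest as a side effect; the removed functions in this hunk compensate for that side effect with dest += 8 - (stride * 16) after every 16 rows. RECON_AND_STORE itself is defined elsewhere in the file; a hedged sketch of the per-row operation it performs (paraphrased, not quoted):

#include <emmintrin.h>
#include <stdint.h>

/* One reconstructed row: widen 8 destination pixels to 16 bits, add the
 * residual row, and store back with unsigned saturation. */
static void sketch_recon_and_store_row(uint8_t *dest, __m128i residual) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)dest);  /* load 8 pixels */
  d = _mm_unpacklo_epi8(d, zero);                       /* widen to 16-bit */
  d = _mm_add_epi16(d, residual);                       /* add residual */
  d = _mm_packus_epi16(d, d);                           /* clamp to [0, 255] */
  _mm_storel_epi64((__m128i *)dest, d);                 /* write 8 pixels */
}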
 
 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
@@ -2362,1633 +178,3 @@
   dest += 8;
   write_buffer_8x16(dest, in1, stride);
 }
-
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    array_transpose_4X8(l + 8*i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    // Final rounding and shift
-    in[0] = _mm_adds_epi16(in[0], final_rounding);
-    in[1] = _mm_adds_epi16(in[1], final_rounding);
-    in[2] = _mm_adds_epi16(in[2], final_rounding);
-    in[3] = _mm_adds_epi16(in[3], final_rounding);
-    in[4] = _mm_adds_epi16(in[4], final_rounding);
-    in[5] = _mm_adds_epi16(in[5], final_rounding);
-    in[6] = _mm_adds_epi16(in[6], final_rounding);
-    in[7] = _mm_adds_epi16(in[7], final_rounding);
-    in[8] = _mm_adds_epi16(in[8], final_rounding);
-    in[9] = _mm_adds_epi16(in[9], final_rounding);
-    in[10] = _mm_adds_epi16(in[10], final_rounding);
-    in[11] = _mm_adds_epi16(in[11], final_rounding);
-    in[12] = _mm_adds_epi16(in[12], final_rounding);
-    in[13] = _mm_adds_epi16(in[13], final_rounding);
-    in[14] = _mm_adds_epi16(in[14], final_rounding);
-    in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-    in[0] = _mm_srai_epi16(in[0], 6);
-    in[1] = _mm_srai_epi16(in[1], 6);
-    in[2] = _mm_srai_epi16(in[2], 6);
-    in[3] = _mm_srai_epi16(in[3], 6);
-    in[4] = _mm_srai_epi16(in[4], 6);
-    in[5] = _mm_srai_epi16(in[5], 6);
-    in[6] = _mm_srai_epi16(in[6], 6);
-    in[7] = _mm_srai_epi16(in[7], 6);
-    in[8] = _mm_srai_epi16(in[8], 6);
-    in[9] = _mm_srai_epi16(in[9], 6);
-    in[10] = _mm_srai_epi16(in[10], 6);
-    in[11] = _mm_srai_epi16(in[11], 6);
-    in[12] = _mm_srai_epi16(in[12], 6);
-    in[13] = _mm_srai_epi16(in[13], 6);
-    in[14] = _mm_srai_epi16(in[14], 6);
-    in[15] = _mm_srai_epi16(in[15], 6);
-
-    RECON_AND_STORE(dest, in[0]);
-    RECON_AND_STORE(dest, in[1]);
-    RECON_AND_STORE(dest, in[2]);
-    RECON_AND_STORE(dest, in[3]);
-    RECON_AND_STORE(dest, in[4]);
-    RECON_AND_STORE(dest, in[5]);
-    RECON_AND_STORE(dest, in[6]);
-    RECON_AND_STORE(dest, in[7]);
-    RECON_AND_STORE(dest, in[8]);
-    RECON_AND_STORE(dest, in[9]);
-    RECON_AND_STORE(dest, in[10]);
-    RECON_AND_STORE(dest, in[11]);
-    RECON_AND_STORE(dest, in[12]);
-    RECON_AND_STORE(dest, in[13]);
-    RECON_AND_STORE(dest, in[14]);
-    RECON_AND_STORE(dest, in[15]);
-
-    dest += 8 - (stride * 16);
-  }
-}
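This _10 variant handles blocks whose end-of-block count is small enough that every nonzero coefficient sits in the top-left 4x4 corner of the 16x16 array: only four partial rows are loaded, the first pass works on that corner, and IDCT16_10 finishes the columns. A hedged sketch of the kind of eob-based dispatch that chooses among the three 16x16 paths in this file (the dispatcher itself lives outside this file; only the three prototypes match the definitions here, and the <= 10 threshold mirrors the function's _10 suffix):

#include <stdint.h>

void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride);
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int stride);
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride);

/* Illustrative dispatch keyed on the end-of-block coefficient count. */
static void sketch_idct16x16_add(const int16_t *input, uint8_t *dest,
                                 int stride, int eob) {
  if (eob == 1)
    vp9_idct16x16_1_add_sse2(input, dest, stride);    /* DC only */
  else if (eob <= 10)
    vp9_idct16x16_10_add_sse2(input, dest, stride);   /* top-left 4x4 only */
  else
    vp9_idct16x16_256_add_sse2(input, dest, stride);  /* full block */
}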
-
-#define LOAD_DQCOEFF(reg, input) \
-  {  \
-    reg = _mm_load_si128((const __m128i *) input); \
-    input += 8; \
-  }  \
-
-#define IDCT32_34 \
-/* Stage1 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
-  \
-  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
-  \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
-  \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
-                         stg1_1, stp1_16, stp1_31); \
-  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
-                         stg1_7, stp1_19, stp1_28); \
-  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
-                         stg1_9, stp1_20, stp1_27); \
-  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
-                         stg1_15, stp1_23, stp1_24); \
-} \
-\
-/* Stage2 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
-  \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
-                         stg2_1, stp2_8, stp2_15); \
-  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
-                         stg2_7, stp2_11, stp2_12); \
-  \
-  stp2_16 = stp1_16; \
-  stp2_19 = stp1_19; \
-  \
-  stp2_20 = stp1_20; \
-  stp2_23 = stp1_23; \
-  \
-  stp2_24 = stp1_24; \
-  stp2_27 = stp1_27; \
-  \
-  stp2_28 = stp1_28; \
-  stp2_31 = stp1_31; \
-} \
-\
-/* Stage3 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
-  \
-  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
-  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
-                         stg3_1, stp1_4, stp1_7); \
-  \
-  stp1_8 = stp2_8; \
-  stp1_11 = stp2_11; \
-  stp1_12 = stp2_12; \
-  stp1_15 = stp2_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
-                         stp1_18, stp1_29) \
-  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
-                         stp1_22, stp1_25) \
-  \
-  stp1_16 = stp2_16; \
-  stp1_31 = stp2_31; \
-  stp1_19 = stp2_19; \
-  stp1_20 = stp2_20; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_27 = stp2_27; \
-  stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
-  \
-  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
-  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
-                         stg4_1, stp2_0, stp2_1); \
-  \
-  stp2_4 = stp1_4; \
-  stp2_5 = stp1_4; \
-  stp2_6 = stp1_7; \
-  stp2_7 = stp1_7; \
-  \
-  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
-                         stp2_10, stp2_13) \
-  \
-  stp2_8 = stp1_8; \
-  stp2_15 = stp1_15; \
-  stp2_11 = stp1_11; \
-  stp2_12 = stp1_12; \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
-  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  stp1_0 = stp2_0; \
-  stp1_1 = stp2_1; \
-  stp1_2 = stp2_1; \
-  stp1_3 = stp2_0; \
-  \
-  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-  \
-  tmp0 = _mm_add_epi32(tmp0, rounding); \
-  tmp1 = _mm_add_epi32(tmp1, rounding); \
-  tmp2 = _mm_add_epi32(tmp2, rounding); \
-  tmp3 = _mm_add_epi32(tmp3, rounding); \
-  \
-  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  \
-  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  \
-  stp1_4 = stp2_4; \
-  stp1_7 = stp2_7; \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  \
-  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  \
-  stp1_22 = stp2_22; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_25 = stp2_25; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-  \
-  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-  \
-  stp2_8 = stp1_8; \
-  stp2_9 = stp1_9; \
-  stp2_14 = stp1_14; \
-  stp2_15 = stp1_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
-                         stp2_13, stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-  \
-  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  stp1_18 = stp2_18; \
-  stp1_19 = stp2_19; \
-  \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-  \
-  stp1_28 = stp2_28; \
-  stp1_29 = stp2_29; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-}
-
-
-#define IDCT32 \
-/* Stage1 */ \
-{ \
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
-  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
-  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
-  \
-  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
-  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
-  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
-  \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
-  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
-  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
-  \
-  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
-  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
-  \
-  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
-                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
-                         stp1_17, stp1_30) \
-  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
-                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
-                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
-                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-} \
-\
-/* Stage2 */ \
-{ \
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
-  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
-  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
-  \
-  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
-  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
-  \
-  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
-                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
-                         stp2_14) \
-  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
-                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
-                         stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
-  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
-  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
-  \
-  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
-  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
-  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
-  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
-  \
-  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
-  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
-  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
-} \
-\
-/* Stage3 */ \
-{ \
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
-  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
-  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
-  \
-  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
-  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  \
-  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
-                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
-                         stp1_6) \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
-  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
-  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
-  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  \
-  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
-                         stp1_18, stp1_29) \
-  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
-                         stp1_22, stp1_25) \
-  \
-  stp1_16 = stp2_16; \
-  stp1_31 = stp2_31; \
-  stp1_19 = stp2_19; \
-  stp1_20 = stp2_20; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_27 = stp2_27; \
-  stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
-  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
-  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
-  \
-  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  \
-  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
-                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
-                         stp2_2, stp2_3) \
-  \
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-  \
-  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
-                         stp2_10, stp2_13) \
-  \
-  stp2_8 = stp1_8; \
-  stp2_15 = stp1_15; \
-  stp2_11 = stp1_11; \
-  stp2_12 = stp1_12; \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
-  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-  \
-  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-  \
-  tmp0 = _mm_add_epi32(tmp0, rounding); \
-  tmp1 = _mm_add_epi32(tmp1, rounding); \
-  tmp2 = _mm_add_epi32(tmp2, rounding); \
-  tmp3 = _mm_add_epi32(tmp3, rounding); \
-  \
-  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  \
-  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  \
-  stp1_4 = stp2_4; \
-  stp1_7 = stp2_7; \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  \
-  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  \
-  stp1_22 = stp2_22; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_25 = stp2_25; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-  \
-  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-  \
-  stp2_8 = stp1_8; \
-  stp2_9 = stp1_9; \
-  stp2_14 = stp1_14; \
-  stp2_15 = stp1_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
-                         stp2_13, stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-  \
-  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  stp1_18 = stp2_18; \
-  stp1_19 = stp2_19; \
-  \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-  \
-  stp1_28 = stp2_28; \
-  stp1_29 = stp2_29; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-}
-
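IDCT32 is the full 32-point 1-D transform; IDCT32_34 above is the reduced variant used when only the upper-left 8x8 coefficients can be non-zero, which is why its unpack steps pair each input with a zero register instead of a second coefficient. Each MULTIPLICATION_AND_ADD invocation performs a pair of rotation butterflies on the interleaved 16-bit lanes. A scalar model of one butterfly, assuming the constants are the cosine pairs packed by pair_set_epi16() (a sketch of the arithmetic, not the macro itself):

    static void butterfly_model(int x, int y, int c0, int c1, int c2, int c3,
                                int *out0, int *out1) {
      const int rounding = 1 << (DCT_CONST_BITS - 1);
      *out0 = (x * c0 + y * c1 + rounding) >> DCT_CONST_BITS;  /* first output of the pair */
      *out1 = (x * c2 + y * c3 + rounding) >> DCT_CONST_BITS;  /* second output of the pair */
    }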
-// Only upper-left 8x8 has non-zero coeff
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1<<5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
-          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
-          stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
-          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
-          stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // Load input data.
-  LOAD_DQCOEFF(in[0], input);
-  LOAD_DQCOEFF(in[8], input);
-  LOAD_DQCOEFF(in[16], input);
-  LOAD_DQCOEFF(in[24], input);
-  LOAD_DQCOEFF(in[1], input);
-  LOAD_DQCOEFF(in[9], input);
-  LOAD_DQCOEFF(in[17], input);
-  LOAD_DQCOEFF(in[25], input);
-  LOAD_DQCOEFF(in[2], input);
-  LOAD_DQCOEFF(in[10], input);
-  LOAD_DQCOEFF(in[18], input);
-  LOAD_DQCOEFF(in[26], input);
-  LOAD_DQCOEFF(in[3], input);
-  LOAD_DQCOEFF(in[11], input);
-  LOAD_DQCOEFF(in[19], input);
-  LOAD_DQCOEFF(in[27], input);
-
-  LOAD_DQCOEFF(in[4], input);
-  LOAD_DQCOEFF(in[12], input);
-  LOAD_DQCOEFF(in[20], input);
-  LOAD_DQCOEFF(in[28], input);
-  LOAD_DQCOEFF(in[5], input);
-  LOAD_DQCOEFF(in[13], input);
-  LOAD_DQCOEFF(in[21], input);
-  LOAD_DQCOEFF(in[29], input);
-  LOAD_DQCOEFF(in[6], input);
-  LOAD_DQCOEFF(in[14], input);
-  LOAD_DQCOEFF(in[22], input);
-  LOAD_DQCOEFF(in[30], input);
-  LOAD_DQCOEFF(in[7], input);
-  LOAD_DQCOEFF(in[15], input);
-  LOAD_DQCOEFF(in[23], input);
-  LOAD_DQCOEFF(in[31], input);
-
-  array_transpose_8x8(in, in);
-  array_transpose_8x8(in+8, in+8);
-  array_transpose_8x8(in+16, in+16);
-  array_transpose_8x8(in+24, in+24);
-
-  IDCT32
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-      const __m128i zero = _mm_setzero_si128();
-      // Transpose 32x8 block to 8x32 block
-      array_transpose_8x8(col+i*8, in);
-      IDCT32_34
-
-      // 2_D: Calculate the results and store them to destination.
-      in[0] = _mm_add_epi16(stp1_0, stp1_31);
-      in[1] = _mm_add_epi16(stp1_1, stp1_30);
-      in[2] = _mm_add_epi16(stp1_2, stp1_29);
-      in[3] = _mm_add_epi16(stp1_3, stp1_28);
-      in[4] = _mm_add_epi16(stp1_4, stp1_27);
-      in[5] = _mm_add_epi16(stp1_5, stp1_26);
-      in[6] = _mm_add_epi16(stp1_6, stp1_25);
-      in[7] = _mm_add_epi16(stp1_7, stp1_24);
-      in[8] = _mm_add_epi16(stp1_8, stp1_23);
-      in[9] = _mm_add_epi16(stp1_9, stp1_22);
-      in[10] = _mm_add_epi16(stp1_10, stp1_21);
-      in[11] = _mm_add_epi16(stp1_11, stp1_20);
-      in[12] = _mm_add_epi16(stp1_12, stp1_19);
-      in[13] = _mm_add_epi16(stp1_13, stp1_18);
-      in[14] = _mm_add_epi16(stp1_14, stp1_17);
-      in[15] = _mm_add_epi16(stp1_15, stp1_16);
-      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-      // Final rounding and shift
-      in[0] = _mm_adds_epi16(in[0], final_rounding);
-      in[1] = _mm_adds_epi16(in[1], final_rounding);
-      in[2] = _mm_adds_epi16(in[2], final_rounding);
-      in[3] = _mm_adds_epi16(in[3], final_rounding);
-      in[4] = _mm_adds_epi16(in[4], final_rounding);
-      in[5] = _mm_adds_epi16(in[5], final_rounding);
-      in[6] = _mm_adds_epi16(in[6], final_rounding);
-      in[7] = _mm_adds_epi16(in[7], final_rounding);
-      in[8] = _mm_adds_epi16(in[8], final_rounding);
-      in[9] = _mm_adds_epi16(in[9], final_rounding);
-      in[10] = _mm_adds_epi16(in[10], final_rounding);
-      in[11] = _mm_adds_epi16(in[11], final_rounding);
-      in[12] = _mm_adds_epi16(in[12], final_rounding);
-      in[13] = _mm_adds_epi16(in[13], final_rounding);
-      in[14] = _mm_adds_epi16(in[14], final_rounding);
-      in[15] = _mm_adds_epi16(in[15], final_rounding);
-      in[16] = _mm_adds_epi16(in[16], final_rounding);
-      in[17] = _mm_adds_epi16(in[17], final_rounding);
-      in[18] = _mm_adds_epi16(in[18], final_rounding);
-      in[19] = _mm_adds_epi16(in[19], final_rounding);
-      in[20] = _mm_adds_epi16(in[20], final_rounding);
-      in[21] = _mm_adds_epi16(in[21], final_rounding);
-      in[22] = _mm_adds_epi16(in[22], final_rounding);
-      in[23] = _mm_adds_epi16(in[23], final_rounding);
-      in[24] = _mm_adds_epi16(in[24], final_rounding);
-      in[25] = _mm_adds_epi16(in[25], final_rounding);
-      in[26] = _mm_adds_epi16(in[26], final_rounding);
-      in[27] = _mm_adds_epi16(in[27], final_rounding);
-      in[28] = _mm_adds_epi16(in[28], final_rounding);
-      in[29] = _mm_adds_epi16(in[29], final_rounding);
-      in[30] = _mm_adds_epi16(in[30], final_rounding);
-      in[31] = _mm_adds_epi16(in[31], final_rounding);
-
-      in[0] = _mm_srai_epi16(in[0], 6);
-      in[1] = _mm_srai_epi16(in[1], 6);
-      in[2] = _mm_srai_epi16(in[2], 6);
-      in[3] = _mm_srai_epi16(in[3], 6);
-      in[4] = _mm_srai_epi16(in[4], 6);
-      in[5] = _mm_srai_epi16(in[5], 6);
-      in[6] = _mm_srai_epi16(in[6], 6);
-      in[7] = _mm_srai_epi16(in[7], 6);
-      in[8] = _mm_srai_epi16(in[8], 6);
-      in[9] = _mm_srai_epi16(in[9], 6);
-      in[10] = _mm_srai_epi16(in[10], 6);
-      in[11] = _mm_srai_epi16(in[11], 6);
-      in[12] = _mm_srai_epi16(in[12], 6);
-      in[13] = _mm_srai_epi16(in[13], 6);
-      in[14] = _mm_srai_epi16(in[14], 6);
-      in[15] = _mm_srai_epi16(in[15], 6);
-      in[16] = _mm_srai_epi16(in[16], 6);
-      in[17] = _mm_srai_epi16(in[17], 6);
-      in[18] = _mm_srai_epi16(in[18], 6);
-      in[19] = _mm_srai_epi16(in[19], 6);
-      in[20] = _mm_srai_epi16(in[20], 6);
-      in[21] = _mm_srai_epi16(in[21], 6);
-      in[22] = _mm_srai_epi16(in[22], 6);
-      in[23] = _mm_srai_epi16(in[23], 6);
-      in[24] = _mm_srai_epi16(in[24], 6);
-      in[25] = _mm_srai_epi16(in[25], 6);
-      in[26] = _mm_srai_epi16(in[26], 6);
-      in[27] = _mm_srai_epi16(in[27], 6);
-      in[28] = _mm_srai_epi16(in[28], 6);
-      in[29] = _mm_srai_epi16(in[29], 6);
-      in[30] = _mm_srai_epi16(in[30], 6);
-      in[31] = _mm_srai_epi16(in[31], 6);
-
-      RECON_AND_STORE(dest, in[0]);
-      RECON_AND_STORE(dest, in[1]);
-      RECON_AND_STORE(dest, in[2]);
-      RECON_AND_STORE(dest, in[3]);
-      RECON_AND_STORE(dest, in[4]);
-      RECON_AND_STORE(dest, in[5]);
-      RECON_AND_STORE(dest, in[6]);
-      RECON_AND_STORE(dest, in[7]);
-      RECON_AND_STORE(dest, in[8]);
-      RECON_AND_STORE(dest, in[9]);
-      RECON_AND_STORE(dest, in[10]);
-      RECON_AND_STORE(dest, in[11]);
-      RECON_AND_STORE(dest, in[12]);
-      RECON_AND_STORE(dest, in[13]);
-      RECON_AND_STORE(dest, in[14]);
-      RECON_AND_STORE(dest, in[15]);
-      RECON_AND_STORE(dest, in[16]);
-      RECON_AND_STORE(dest, in[17]);
-      RECON_AND_STORE(dest, in[18]);
-      RECON_AND_STORE(dest, in[19]);
-      RECON_AND_STORE(dest, in[20]);
-      RECON_AND_STORE(dest, in[21]);
-      RECON_AND_STORE(dest, in[22]);
-      RECON_AND_STORE(dest, in[23]);
-      RECON_AND_STORE(dest, in[24]);
-      RECON_AND_STORE(dest, in[25]);
-      RECON_AND_STORE(dest, in[26]);
-      RECON_AND_STORE(dest, in[27]);
-      RECON_AND_STORE(dest, in[28]);
-      RECON_AND_STORE(dest, in[29]);
-      RECON_AND_STORE(dest, in[30]);
-      RECON_AND_STORE(dest, in[31]);
-
-      dest += 8 - (stride * 32);
-    }
-  }
-
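The function above follows the separable 2-D pattern used throughout this file: one 1-D pass over the transposed non-zero rows fills the 32 intermediate col vectors, then a second 1-D pass plus the rounding, shift and reconstruction handles one 8-pixel-wide strip of the destination per loop iteration. Each RECON_AND_STORE is expected to advance dest by one row, so the closing adjustment dest += 8 - (stride * 32) rewinds to the top of the block and steps to the next strip. A scalar sketch of that pointer walk, with the per-row reconstruction elided (walk_output_strips is an illustrative name):

    static void walk_output_strips(uint8_t *dest, int stride) {
      int strip, row;
      for (strip = 0; strip < 4; ++strip) {  /* four 8-pixel-wide strips */
        for (row = 0; row < 32; ++row) {
          /* reconstruct 8 pixels at dest here (RECON_AND_STORE in the real code) */
          dest += stride;                    /* advance assumed to happen inside the macro */
        }
        dest += 8 - (stride * 32);           /* back to the top row, over to the next strip */
      }
    }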
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1<<5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
-          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
-          stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
-          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
-          stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-  int zero_flag[2];
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-      // First 1-D idct
-      // Load input data.
-      LOAD_DQCOEFF(in[0], input);
-      LOAD_DQCOEFF(in[8], input);
-      LOAD_DQCOEFF(in[16], input);
-      LOAD_DQCOEFF(in[24], input);
-      LOAD_DQCOEFF(in[1], input);
-      LOAD_DQCOEFF(in[9], input);
-      LOAD_DQCOEFF(in[17], input);
-      LOAD_DQCOEFF(in[25], input);
-      LOAD_DQCOEFF(in[2], input);
-      LOAD_DQCOEFF(in[10], input);
-      LOAD_DQCOEFF(in[18], input);
-      LOAD_DQCOEFF(in[26], input);
-      LOAD_DQCOEFF(in[3], input);
-      LOAD_DQCOEFF(in[11], input);
-      LOAD_DQCOEFF(in[19], input);
-      LOAD_DQCOEFF(in[27], input);
-
-      LOAD_DQCOEFF(in[4], input);
-      LOAD_DQCOEFF(in[12], input);
-      LOAD_DQCOEFF(in[20], input);
-      LOAD_DQCOEFF(in[28], input);
-      LOAD_DQCOEFF(in[5], input);
-      LOAD_DQCOEFF(in[13], input);
-      LOAD_DQCOEFF(in[21], input);
-      LOAD_DQCOEFF(in[29], input);
-      LOAD_DQCOEFF(in[6], input);
-      LOAD_DQCOEFF(in[14], input);
-      LOAD_DQCOEFF(in[22], input);
-      LOAD_DQCOEFF(in[30], input);
-      LOAD_DQCOEFF(in[7], input);
-      LOAD_DQCOEFF(in[15], input);
-      LOAD_DQCOEFF(in[23], input);
-      LOAD_DQCOEFF(in[31], input);
-
-      // checking if all entries are zero
-      zero_idx[0] = _mm_or_si128(in[0], in[1]);
-      zero_idx[1] = _mm_or_si128(in[2], in[3]);
-      zero_idx[2] = _mm_or_si128(in[4], in[5]);
-      zero_idx[3] = _mm_or_si128(in[6], in[7]);
-      zero_idx[4] = _mm_or_si128(in[8], in[9]);
-      zero_idx[5] = _mm_or_si128(in[10], in[11]);
-      zero_idx[6] = _mm_or_si128(in[12], in[13]);
-      zero_idx[7] = _mm_or_si128(in[14], in[15]);
-      zero_idx[8] = _mm_or_si128(in[16], in[17]);
-      zero_idx[9] = _mm_or_si128(in[18], in[19]);
-      zero_idx[10] = _mm_or_si128(in[20], in[21]);
-      zero_idx[11] = _mm_or_si128(in[22], in[23]);
-      zero_idx[12] = _mm_or_si128(in[24], in[25]);
-      zero_idx[13] = _mm_or_si128(in[26], in[27]);
-      zero_idx[14] = _mm_or_si128(in[28], in[29]);
-      zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
-      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
-      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
-      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
-      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
-
-      if (!zero_flag[0] && !zero_flag[1]) {
-        col[i32 + 0] = _mm_setzero_si128();
-        col[i32 + 1] = _mm_setzero_si128();
-        col[i32 + 2] = _mm_setzero_si128();
-        col[i32 + 3] = _mm_setzero_si128();
-        col[i32 + 4] = _mm_setzero_si128();
-        col[i32 + 5] = _mm_setzero_si128();
-        col[i32 + 6] = _mm_setzero_si128();
-        col[i32 + 7] = _mm_setzero_si128();
-        col[i32 + 8] = _mm_setzero_si128();
-        col[i32 + 9] = _mm_setzero_si128();
-        col[i32 + 10] = _mm_setzero_si128();
-        col[i32 + 11] = _mm_setzero_si128();
-        col[i32 + 12] = _mm_setzero_si128();
-        col[i32 + 13] = _mm_setzero_si128();
-        col[i32 + 14] = _mm_setzero_si128();
-        col[i32 + 15] = _mm_setzero_si128();
-        col[i32 + 16] = _mm_setzero_si128();
-        col[i32 + 17] = _mm_setzero_si128();
-        col[i32 + 18] = _mm_setzero_si128();
-        col[i32 + 19] = _mm_setzero_si128();
-        col[i32 + 20] = _mm_setzero_si128();
-        col[i32 + 21] = _mm_setzero_si128();
-        col[i32 + 22] = _mm_setzero_si128();
-        col[i32 + 23] = _mm_setzero_si128();
-        col[i32 + 24] = _mm_setzero_si128();
-        col[i32 + 25] = _mm_setzero_si128();
-        col[i32 + 26] = _mm_setzero_si128();
-        col[i32 + 27] = _mm_setzero_si128();
-        col[i32 + 28] = _mm_setzero_si128();
-        col[i32 + 29] = _mm_setzero_si128();
-        col[i32 + 30] = _mm_setzero_si128();
-        col[i32 + 31] = _mm_setzero_si128();
-        continue;
-      }
-
-      // Transpose 32x8 block to 8x32 block
-      array_transpose_8x8(in, in);
-      array_transpose_8x8(in+8, in+8);
-      array_transpose_8x8(in+16, in+16);
-      array_transpose_8x8(in+24, in+24);
-
-      IDCT32
-
-      // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-    }
-  for (i = 0; i < 4; i++) {
-      const __m128i zero = _mm_setzero_si128();
-      // Second 1-D idct
-      j = i << 3;
-
-      // Transpose 32x8 block to 8x32 block
-      array_transpose_8x8(col+j, in);
-      array_transpose_8x8(col+j+32, in+8);
-      array_transpose_8x8(col+j+64, in+16);
-      array_transpose_8x8(col+j+96, in+24);
-
-      IDCT32
-
-      // 2_D: Calculate the results and store them to destination.
-      in[0] = _mm_add_epi16(stp1_0, stp1_31);
-      in[1] = _mm_add_epi16(stp1_1, stp1_30);
-      in[2] = _mm_add_epi16(stp1_2, stp1_29);
-      in[3] = _mm_add_epi16(stp1_3, stp1_28);
-      in[4] = _mm_add_epi16(stp1_4, stp1_27);
-      in[5] = _mm_add_epi16(stp1_5, stp1_26);
-      in[6] = _mm_add_epi16(stp1_6, stp1_25);
-      in[7] = _mm_add_epi16(stp1_7, stp1_24);
-      in[8] = _mm_add_epi16(stp1_8, stp1_23);
-      in[9] = _mm_add_epi16(stp1_9, stp1_22);
-      in[10] = _mm_add_epi16(stp1_10, stp1_21);
-      in[11] = _mm_add_epi16(stp1_11, stp1_20);
-      in[12] = _mm_add_epi16(stp1_12, stp1_19);
-      in[13] = _mm_add_epi16(stp1_13, stp1_18);
-      in[14] = _mm_add_epi16(stp1_14, stp1_17);
-      in[15] = _mm_add_epi16(stp1_15, stp1_16);
-      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-      // Final rounding and shift
-      in[0] = _mm_adds_epi16(in[0], final_rounding);
-      in[1] = _mm_adds_epi16(in[1], final_rounding);
-      in[2] = _mm_adds_epi16(in[2], final_rounding);
-      in[3] = _mm_adds_epi16(in[3], final_rounding);
-      in[4] = _mm_adds_epi16(in[4], final_rounding);
-      in[5] = _mm_adds_epi16(in[5], final_rounding);
-      in[6] = _mm_adds_epi16(in[6], final_rounding);
-      in[7] = _mm_adds_epi16(in[7], final_rounding);
-      in[8] = _mm_adds_epi16(in[8], final_rounding);
-      in[9] = _mm_adds_epi16(in[9], final_rounding);
-      in[10] = _mm_adds_epi16(in[10], final_rounding);
-      in[11] = _mm_adds_epi16(in[11], final_rounding);
-      in[12] = _mm_adds_epi16(in[12], final_rounding);
-      in[13] = _mm_adds_epi16(in[13], final_rounding);
-      in[14] = _mm_adds_epi16(in[14], final_rounding);
-      in[15] = _mm_adds_epi16(in[15], final_rounding);
-      in[16] = _mm_adds_epi16(in[16], final_rounding);
-      in[17] = _mm_adds_epi16(in[17], final_rounding);
-      in[18] = _mm_adds_epi16(in[18], final_rounding);
-      in[19] = _mm_adds_epi16(in[19], final_rounding);
-      in[20] = _mm_adds_epi16(in[20], final_rounding);
-      in[21] = _mm_adds_epi16(in[21], final_rounding);
-      in[22] = _mm_adds_epi16(in[22], final_rounding);
-      in[23] = _mm_adds_epi16(in[23], final_rounding);
-      in[24] = _mm_adds_epi16(in[24], final_rounding);
-      in[25] = _mm_adds_epi16(in[25], final_rounding);
-      in[26] = _mm_adds_epi16(in[26], final_rounding);
-      in[27] = _mm_adds_epi16(in[27], final_rounding);
-      in[28] = _mm_adds_epi16(in[28], final_rounding);
-      in[29] = _mm_adds_epi16(in[29], final_rounding);
-      in[30] = _mm_adds_epi16(in[30], final_rounding);
-      in[31] = _mm_adds_epi16(in[31], final_rounding);
-
-      in[0] = _mm_srai_epi16(in[0], 6);
-      in[1] = _mm_srai_epi16(in[1], 6);
-      in[2] = _mm_srai_epi16(in[2], 6);
-      in[3] = _mm_srai_epi16(in[3], 6);
-      in[4] = _mm_srai_epi16(in[4], 6);
-      in[5] = _mm_srai_epi16(in[5], 6);
-      in[6] = _mm_srai_epi16(in[6], 6);
-      in[7] = _mm_srai_epi16(in[7], 6);
-      in[8] = _mm_srai_epi16(in[8], 6);
-      in[9] = _mm_srai_epi16(in[9], 6);
-      in[10] = _mm_srai_epi16(in[10], 6);
-      in[11] = _mm_srai_epi16(in[11], 6);
-      in[12] = _mm_srai_epi16(in[12], 6);
-      in[13] = _mm_srai_epi16(in[13], 6);
-      in[14] = _mm_srai_epi16(in[14], 6);
-      in[15] = _mm_srai_epi16(in[15], 6);
-      in[16] = _mm_srai_epi16(in[16], 6);
-      in[17] = _mm_srai_epi16(in[17], 6);
-      in[18] = _mm_srai_epi16(in[18], 6);
-      in[19] = _mm_srai_epi16(in[19], 6);
-      in[20] = _mm_srai_epi16(in[20], 6);
-      in[21] = _mm_srai_epi16(in[21], 6);
-      in[22] = _mm_srai_epi16(in[22], 6);
-      in[23] = _mm_srai_epi16(in[23], 6);
-      in[24] = _mm_srai_epi16(in[24], 6);
-      in[25] = _mm_srai_epi16(in[25], 6);
-      in[26] = _mm_srai_epi16(in[26], 6);
-      in[27] = _mm_srai_epi16(in[27], 6);
-      in[28] = _mm_srai_epi16(in[28], 6);
-      in[29] = _mm_srai_epi16(in[29], 6);
-      in[30] = _mm_srai_epi16(in[30], 6);
-      in[31] = _mm_srai_epi16(in[31], 6);
-
-      RECON_AND_STORE(dest, in[0]);
-      RECON_AND_STORE(dest, in[1]);
-      RECON_AND_STORE(dest, in[2]);
-      RECON_AND_STORE(dest, in[3]);
-      RECON_AND_STORE(dest, in[4]);
-      RECON_AND_STORE(dest, in[5]);
-      RECON_AND_STORE(dest, in[6]);
-      RECON_AND_STORE(dest, in[7]);
-      RECON_AND_STORE(dest, in[8]);
-      RECON_AND_STORE(dest, in[9]);
-      RECON_AND_STORE(dest, in[10]);
-      RECON_AND_STORE(dest, in[11]);
-      RECON_AND_STORE(dest, in[12]);
-      RECON_AND_STORE(dest, in[13]);
-      RECON_AND_STORE(dest, in[14]);
-      RECON_AND_STORE(dest, in[15]);
-      RECON_AND_STORE(dest, in[16]);
-      RECON_AND_STORE(dest, in[17]);
-      RECON_AND_STORE(dest, in[18]);
-      RECON_AND_STORE(dest, in[19]);
-      RECON_AND_STORE(dest, in[20]);
-      RECON_AND_STORE(dest, in[21]);
-      RECON_AND_STORE(dest, in[22]);
-      RECON_AND_STORE(dest, in[23]);
-      RECON_AND_STORE(dest, in[24]);
-      RECON_AND_STORE(dest, in[25]);
-      RECON_AND_STORE(dest, in[26]);
-      RECON_AND_STORE(dest, in[27]);
-      RECON_AND_STORE(dest, in[28]);
-      RECON_AND_STORE(dest, in[29]);
-      RECON_AND_STORE(dest, in[30]);
-      RECON_AND_STORE(dest, in[31]);
-
-      dest += 8 - (stride * 32);
-    }
-}  //NOLINT
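One detail worth noting in the function above: the "checking if all entries are zero" block OR-reduces all 32 coefficient vectors of an 8x32 strip down to two 32-bit flags and, when both are zero, skips the first-pass IDCT for that strip and simply clears the corresponding intermediate columns. A scalar model of what that reduction computes (strip_is_all_zero is an illustrative name):

    static int strip_is_all_zero(const int16_t *coeffs, int count) {
      int16_t acc = 0;
      int k;
      for (k = 0; k < count; ++k)
        acc |= coeffs[k];  /* any non-zero coefficient leaves acc non-zero */
      return acc == 0;
    }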
-
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 4; ++i) {
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    RECON_AND_STORE(dest, dc_value);
-    dest += 8 - (stride * 32);
-  }
-}
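The DC-only variant above derives a single residual value from the DC coefficient (two fixed-point multiplications by cospi_16_64 with rounding, then a final round-and-shift by 6) and adds it to every pixel of the 32x32 block. A scalar sketch of the same computation, using the dct_const_round_shift() and ROUND_POWER_OF_TWO() helpers the function itself relies on (idct32x32_dc_only_model is an illustrative name and, unlike the SSE2 code, it processes one pixel at a time):

    static void idct32x32_dc_only_model(int16_t dc, uint8_t *dest, int stride) {
      int a = dct_const_round_shift(dc * cospi_16_64);
      int r, c, v;
      a = dct_const_round_shift(a * cospi_16_64);
      a = ROUND_POWER_OF_TWO(a, 6);                            /* drop 6 fractional bits */
      for (r = 0; r < 32; ++r) {
        for (c = 0; c < 32; ++c) {
          v = dest[r * stride + c] + a;                        /* add the DC residual */
          dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
      }
    }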
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c b/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
deleted file mode 100644
index 73bf5d1..0000000
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
-#include <tmmintrin.h>  // SSSE3
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-static void idct16_8col(__m128i *in, int round) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8]  = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9]  = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_add_epi16(t[0], t[1]);
-  u[1] = _mm_sub_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
-  s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9]  = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_sub_epi16(s[6], s[5]);
-  u[1] = _mm_add_epi16(s[6], s[5]);
-  t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
-  t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  if (round == 1) {
-    s[0] = _mm_add_epi16(t[0], t[7]);
-    s[1] = _mm_add_epi16(t[1], t[6]);
-    s[2] = _mm_add_epi16(t[2], t[5]);
-    s[3] = _mm_add_epi16(t[3], t[4]);
-    s[4] = _mm_sub_epi16(t[3], t[4]);
-    s[5] = _mm_sub_epi16(t[2], t[5]);
-    s[6] = _mm_sub_epi16(t[1], t[6]);
-    s[7] = _mm_sub_epi16(t[0], t[7]);
-    s[8] = t[8];
-    s[9] = t[9];
-
-    u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-    u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-    u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-    u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-    v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-    v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-    v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-    v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-    v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-    v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-    v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-    v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-    u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-    u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-    u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-    u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-    u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-    u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-    u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-    u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-    u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-    u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-    u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-    u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-    u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-    u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-    u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-    u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-    s[10] = _mm_packs_epi32(u[0], u[1]);
-    s[13] = _mm_packs_epi32(u[2], u[3]);
-    s[11] = _mm_packs_epi32(u[4], u[5]);
-    s[12] = _mm_packs_epi32(u[6], u[7]);
-    s[14] = t[14];
-    s[15] = t[15];
-  } else {
-    s[0] = _mm_add_epi16(t[0], t[7]);
-    s[1] = _mm_add_epi16(t[1], t[6]);
-    s[2] = _mm_add_epi16(t[2], t[5]);
-    s[3] = _mm_add_epi16(t[3], t[4]);
-    s[4] = _mm_sub_epi16(t[3], t[4]);
-    s[5] = _mm_sub_epi16(t[2], t[5]);
-    s[6] = _mm_sub_epi16(t[1], t[6]);
-    s[7] = _mm_sub_epi16(t[0], t[7]);
-    s[8] = t[8];
-    s[9] = t[9];
-
-    u[0] = _mm_sub_epi16(t[13], t[10]);
-    u[1] = _mm_add_epi16(t[13], t[10]);
-    u[2] = _mm_sub_epi16(t[12], t[11]);
-    u[3] = _mm_add_epi16(t[12], t[11]);
-
-    s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
-    s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-    s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
-    s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
-    s[14] = t[14];
-    s[15] = t[15];
-  }
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
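Every rotation stage in idct16_8col above follows one pattern: interleave two coefficient rows with unpack{lo,hi}, multiply-accumulate against a pair_set_epi16(c0, c1) constant with _mm_madd_epi16, then finish dct_const_round_shift with DCT_CONST_ROUNDING and a shift by DCT_CONST_BITS. The SSSE3-specific shortcut is the pre-doubled constant (23170 == 2 * cospi_16_64) fed to _mm_mulhrs_epi16, which folds the rounding into a single instruction. A scalar sketch of one output lane, under the usual DCT_CONST_BITS == 14:

    /* unpack + _mm_madd_epi16 on interleaved (a, b) with pair (c0, c1): */
    static int16_t rotate_lane_sketch(int16_t a, int16_t b, int c0, int c1) {
      const int32_t t = a * c0 + b * c1;
      return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }
    /* _mm_mulhrs_epi16(x, 2 * c) == (x * 2 * c + (1 << 14)) >> 15, which is
       the same rounded product as (x * c + DCT_CONST_ROUNDING) >> 14. */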
-
-static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0, round);
-  idct16_8col(in1, round);
-}
-
-void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  __m128i in0[16], in1[16];
-
-  load_buffer_8x16(input, in0);
-  input += 8;
-  load_buffer_8x16(input, in1);
-
-  idct16_sse2(in0, in1, 0);
-  idct16_sse2(in0, in1, 1);
-
-  write_buffer_8x16(dest, in0, stride);
-  dest += 8;
-  write_buffer_8x16(dest, in1, stride);
-}
-
-static void idct16_10_r1(__m128i *in, __m128i *l) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_01 = dual_set_epi16(3212, 32610);
-  const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
-  const __m128i stg3_01 = dual_set_epi16(6392, 32138);
-  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
-
-
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  __m128i stp1_0, stp1_1, stp1_4, stp1_6,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
-    const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
-
-    stp2_8  = _mm_mulhrs_epi16(lo_1_15, stg2_01);
-    stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
-    stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-
-    stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
-    stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp2, tmp4);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
-    const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
-    const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
-    const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
-    const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
-
-    tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
-    tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
-    tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
-
-    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
-    tmp0   = _mm_mulhrs_epi16(tmp0, stg4_01);
-    tmp4   = _mm_mulhrs_epi16(tmp4, stg4_01);
-
-    stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
-    stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
-static void idct16_10_r2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  const __m128i stg2_0 = dual_set_epi16(3212, 3212);
-  const __m128i stg2_1 = dual_set_epi16(32610, 32610);
-  const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
-  const __m128i stg2_7 = dual_set_epi16(31358, 31358);
-  const __m128i stg3_0 = dual_set_epi16(6392, 6392);
-  const __m128i stg3_1 = dual_set_epi16(32138, 32138);
-  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  /* Stage2 */
-  {
-    stp1_8_0  = _mm_mulhrs_epi16(in[1], stg2_0);
-    stp1_15   = _mm_mulhrs_epi16(in[1], stg2_1);
-    stp1_11   = _mm_mulhrs_epi16(in[3], stg2_6);
-    stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
-  }
-
-  /* Stage3 */
-  {
-    stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
-    stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
-
-    stp1_9  =  stp1_8_0;
-    stp1_10 =  stp1_11;
-
-    stp1_13 = stp1_12_0;
-    stp1_14 = stp1_15;
-  }
-
-  /* Stage4 */
-  {
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
-    stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
-
-    stp2_5 = stp2_4;
-    stp2_6 = stp2_7;
-
-
-    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
-    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
-    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
-    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
-    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, 14);
-    tmp1 = _mm_srai_epi32(tmp1, 14);
-    tmp2 = _mm_srai_epi32(tmp2, 14);
-    tmp3 = _mm_srai_epi32(tmp3, 14);
-    tmp4 = _mm_srai_epi32(tmp4, 14);
-    tmp5 = _mm_srai_epi32(tmp5, 14);
-    tmp6 = _mm_srai_epi32(tmp6, 14);
-    tmp7 = _mm_srai_epi32(tmp7, 14);
-
-    stp2_9 = _mm_packs_epi32(tmp0, tmp1);
-    stp2_14 = _mm_packs_epi32(tmp2, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp4, tmp5);
-    stp2_13 = _mm_packs_epi32(tmp6, tmp7);
-  }
-
-  /* Stage5 */
-  {
-    stp1_2 = stp1_0;
-    stp1_3 = stp1_0;
-
-    tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
-    tmp1 = _mm_add_epi16(stp2_6, stp2_5);
-
-    stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
-    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
-
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
-
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
-  }
-
-  /* Stage6 */
-  {
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
-    stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
-
-    tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
-    tmp1 = _mm_add_epi16(stp1_13, stp1_10);
-    tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
-    tmp3 = _mm_add_epi16(stp1_12, stp1_11);
-
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
-    stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
-
-    stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
-    stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
-    stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
-    stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
-  }
-
-  // Stage7
-  in[0] = _mm_add_epi16(stp2_0, stp1_15);
-  in[1] = _mm_add_epi16(stp2_1, stp1_14);
-  in[2] = _mm_add_epi16(stp2_2, stp2_13);
-  in[3] = _mm_add_epi16(stp2_3, stp2_12);
-  in[4] = _mm_add_epi16(stp2_4, stp2_11);
-  in[5] = _mm_add_epi16(stp2_5, stp2_10);
-  in[6] = _mm_add_epi16(stp2_6, stp1_9);
-  in[7] = _mm_add_epi16(stp2_7, stp1_8);
-  in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
-void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1<<5);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i in[16], l[16];
-
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  idct16_10_r1(in, l);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    array_transpose_4X8(l + 8*i, in);
-
-    idct16_10_r2(in);
-
-    // Final rounding and shift
-    in[0] = _mm_adds_epi16(in[0], final_rounding);
-    in[1] = _mm_adds_epi16(in[1], final_rounding);
-    in[2] = _mm_adds_epi16(in[2], final_rounding);
-    in[3] = _mm_adds_epi16(in[3], final_rounding);
-    in[4] = _mm_adds_epi16(in[4], final_rounding);
-    in[5] = _mm_adds_epi16(in[5], final_rounding);
-    in[6] = _mm_adds_epi16(in[6], final_rounding);
-    in[7] = _mm_adds_epi16(in[7], final_rounding);
-    in[8] = _mm_adds_epi16(in[8], final_rounding);
-    in[9] = _mm_adds_epi16(in[9], final_rounding);
-    in[10] = _mm_adds_epi16(in[10], final_rounding);
-    in[11] = _mm_adds_epi16(in[11], final_rounding);
-    in[12] = _mm_adds_epi16(in[12], final_rounding);
-    in[13] = _mm_adds_epi16(in[13], final_rounding);
-    in[14] = _mm_adds_epi16(in[14], final_rounding);
-    in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-    in[0] = _mm_srai_epi16(in[0], 6);
-    in[1] = _mm_srai_epi16(in[1], 6);
-    in[2] = _mm_srai_epi16(in[2], 6);
-    in[3] = _mm_srai_epi16(in[3], 6);
-    in[4] = _mm_srai_epi16(in[4], 6);
-    in[5] = _mm_srai_epi16(in[5], 6);
-    in[6] = _mm_srai_epi16(in[6], 6);
-    in[7] = _mm_srai_epi16(in[7], 6);
-    in[8] = _mm_srai_epi16(in[8], 6);
-    in[9] = _mm_srai_epi16(in[9], 6);
-    in[10] = _mm_srai_epi16(in[10], 6);
-    in[11] = _mm_srai_epi16(in[11], 6);
-    in[12] = _mm_srai_epi16(in[12], 6);
-    in[13] = _mm_srai_epi16(in[13], 6);
-    in[14] = _mm_srai_epi16(in[14], 6);
-    in[15] = _mm_srai_epi16(in[15], 6);
-
-    RECON_AND_STORE(dest, in[0]);
-    RECON_AND_STORE(dest, in[1]);
-    RECON_AND_STORE(dest, in[2]);
-    RECON_AND_STORE(dest, in[3]);
-    RECON_AND_STORE(dest, in[4]);
-    RECON_AND_STORE(dest, in[5]);
-    RECON_AND_STORE(dest, in[6]);
-    RECON_AND_STORE(dest, in[7]);
-    RECON_AND_STORE(dest, in[8]);
-    RECON_AND_STORE(dest, in[9]);
-    RECON_AND_STORE(dest, in[10]);
-    RECON_AND_STORE(dest, in[11]);
-    RECON_AND_STORE(dest, in[12]);
-    RECON_AND_STORE(dest, in[13]);
-    RECON_AND_STORE(dest, in[14]);
-    RECON_AND_STORE(dest, in[15]);
-
-    dest += 8 - (stride * 16);
-  }
-}
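The final rounding and RECON_AND_STORE sequence in the loop above works out to the following per output pixel (a scalar sketch; clip_pixel() stands for the clamp performed by the unsigned-saturating pack in the macro):

    /* res: one 16-bit result of the second 1-D pass */
    const int16_t rounded = (int16_t)((res + (1 << 5)) >> 6);
    dest[c] = clip_pixel(dest[c] + rounded);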
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
deleted file mode 100644
index 69b07f6..0000000
--- a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-
-SECTION .text
-
-INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [aboveq]
-  punpckldq             m0, [leftq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_MMX sse
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_8)]
-  psraw                 m0, 4
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_16)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
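The dc_predictor_* routines above sum the above and left borders with psadbw against zero, add half the border pixel count for rounding (the pw_* constants), shift, and fill the block. A scalar sketch of the 16x16 case (the other sizes only change the count, the rounding constant and the shift; the function name is illustrative):

    static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
      int sum = 16;  /* pw_16: half of the 32 border pixels */
      for (int i = 0; i < 16; ++i) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)(sum >> 5);
      for (int r = 0; r < 16; ++r) memset(dst + r * stride, dc, 16);
    }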
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  psadbw                m3, m1
-  psadbw                m4, m1
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_32)]
-  psraw                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_MMX sse
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movd                  m0, [aboveq]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  RET
-
-INIT_MMX sse
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m1
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m1
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m1
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m1
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_MMX sse
-cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
-  pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  movd                  m0, [aboveq]
-  punpcklbw             m2, m1
-  punpcklbw             m0, m1
-  pshufw                m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  add                leftq, 4
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshufw                m2, m2, 0x0
-  pshufw                m3, m3, 0x0
-  paddw                 m2, m0
-  paddw                 m3, m0
-  packuswb              m2, m2
-  packuswb              m3, m3
-  movd      [dstq        ], m2
-  movd      [dstq+strideq], m3
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
-  pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  movq                  m0, [aboveq]
-  punpcklbw             m2, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -4
-  punpcklqdq            m2, m2
-  add                leftq, 8
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m2, m0
-  paddw                 m3, m0
-  packuswb              m2, m3
-  movq      [dstq        ], m2
-  movhps    [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
-  pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  mova                  m0, [aboveq]
-  punpcklbw             m2, m1
-  punpckhbw             m4, m0, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -8
-  punpcklqdq            m2, m2
-  add                leftq, 16
-  psubw                 m0, m2
-  psubw                 m4, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m5, m2, m0
-  paddw                 m6, m3, m0
-  paddw                 m2, m4
-  paddw                 m3, m4
-  packuswb              m5, m2
-  packuswb              m6, m3
-  mova      [dstq        ], m5
-  mova      [dstq+strideq], m6
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-%if ARCH_X86_64
-INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
-  pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  mova                  m0, [aboveq]
-  mova                  m4, [aboveq+16]
-  punpcklbw             m2, m1
-  punpckhbw             m3, m0, m1
-  punpckhbw             m5, m4, m1
-  punpcklbw             m0, m1
-  punpcklbw             m4, m1
-  pshuflw               m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -16
-  punpcklqdq            m2, m2
-  add                leftq, 32
-  psubw                 m0, m2
-  psubw                 m3, m2
-  psubw                 m4, m2
-  psubw                 m5, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m6, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m6, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m6, m6, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m6, m6
-  paddw                 m7, m2, m0
-  paddw                 m8, m2, m3
-  paddw                 m9, m2, m4
-  paddw                 m2, m5
-  packuswb              m7, m8
-  packuswb              m9, m2
-  paddw                 m2, m6, m0
-  paddw                 m8, m6, m3
-  mova   [dstq           ], m7
-  paddw                 m7, m6, m4
-  paddw                 m6, m5
-  mova   [dstq        +16], m9
-  packuswb              m2, m8
-  packuswb              m7, m6
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m7
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-%endif
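The tm_predictor_* routines above implement the TrueMotion rule: each predicted pixel is the left neighbour plus the above neighbour minus the top-left corner, clamped to 8 bits by packuswb. A scalar sketch for a bs x bs block (clip_pixel() stands for the clamp; the name and signature are illustrative):

    static void tm_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int bs,
                                    const uint8_t *above, const uint8_t *left) {
      const int top_left = above[-1];
      for (int r = 0; r < bs; ++r)
        for (int c = 0; c < bs; ++c)
          dst[r * stride + c] = clip_pixel(left[r] + above[c] - top_left);
    }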
diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 0000000..6029420
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;  This file is a duplicate of mfqe_sse2.asm in VP8.
+;  TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6
+
+.combine
+    movdqa      xmm2, [rax]
+    movdqa      xmm4, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3
+    movdqa      [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
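Both filter_by_weight routines in this file blend the source block into the destination with 4-bit fixed-point weights: tMFQE (16, i.e. 1 << MFQE_PRECISION) splits into src_weight and 16 - src_weight, tMFQE_round (8) rounds, and the shift by 4 scales back down. Per pixel this is, roughly (a scalar sketch; w and h are 16/16 or 8/8):

    const int dst_weight = 16 - src_weight;
    for (int r = 0; r < h; ++r)
      for (int c = 0; c < w; ++c)
        dst[r * dst_stride + c] =
            (uint8_t)((src[r * src_stride + c] * src_weight +
                       dst[r * dst_stride + c] * dst_weight + 8) >> 4);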
+;void vp9_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4
+
+.combine
+    movq        xmm2, [rax]
+    movq        xmm3, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4
+    movq        [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad            5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote to doubles, accumulate,
+    ; shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
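The routine above returns per-pixel averages rather than raw sums: over the 256 pixels it accumulates sum |src1 - src2|, sum src2 and sum src2^2, then the t128 constant and the shifts by 8 round and divide by 256. A scalar sketch of the two outputs (the real code walks rows by stride1/stride2; they are flattened here for brevity):

    uint32_t abs_diff = 0, s = 0, ss = 0;
    for (int i = 0; i < 256; ++i) {
      abs_diff += abs(src1[i] - src2[i]);
      s  += src2[i];
      ss += src2[i] * src2[i];
    }
    *sad      = (abs_diff + 128) >> 8;
    *variance = (ss - ((s * s) >> 8) + 128) >> 8;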
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+    ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128
+%else
+    dq  128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
diff --git a/libvpx/vp9/common/x86/vp9_postproc_mmx.asm b/libvpx/vp9/common/x86/vp9_postproc_mmx.asm
deleted file mode 100644
index 5b8deef..0000000
--- a/libvpx/vp9/common/x86/vp9_postproc_mmx.asm
+++ /dev/null
@@ -1,533 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
-sym(vp9_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm4 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm4 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm4 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm4 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm4, mm1              ; mm5 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with the all cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
-        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
-
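For reference, the across pass above amounts to a [16, 16, 64, 16, 16]/128 blur that is
only applied where every neighbour stays within the threshold of the centre pixel
(otherwise the source pixel is kept). A scalar sketch of that per-pixel decision,
illustration only and not part of the tree, with the threshold passed in as an int:

    #include <stdlib.h>  /* abs() */

    /* Blur with [16,16,64,16,16]/128, but keep the original pixel whenever any
     * neighbour differs from it by more than the threshold (the accumulated
     * pcmpgtw/por mask in the MMX code above). */
    static unsigned char post_proc_across_px(const unsigned char *p, int thresh) {
      static const int kernel[5] = { 16, 16, 64, 16, 16 };
      int k, sum = 0, keep_src = 0;
      for (k = -2; k <= 2; ++k) {
        sum += kernel[k + 2] * p[k];
        if (abs(p[k] - p[0]) > thresh)
          keep_src = 1;                     /* any outlier keeps the source pixel */
      }
      return keep_src ? p[0] : (unsigned char)((sum + 0x40) >> 7);  /* RD, /128 */
    }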
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-;                             int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE
-sym(vp9_mbpost_proc_down_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 136
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword ptr arg(2), 8
-
-    ;for(c=0; c<cols; c+=4)
-.loop_col:
-            mov         rsi,        arg(0)  ;s
-            pxor        mm0,        mm0     ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movd        mm1,        DWORD PTR [rdi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
-            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [rsi+rax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movq        mm4,        [sym(vp9_rv) + rcx*2]
-%endif
-            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         rcx,        15
-            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
-            movd        [rsi],      mm1
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-
-        add         dword arg(0), 4 ; s += 4
-        sub         dword arg(3), 4 ; cols -= 4
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 136
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit2
-
-
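In scalar terms, each column handled by the loop above keeps a sliding 15-row sum and
sum of squares, replaces low-variance pixels (15*sumsq - sum*sum < flimit) with a
dithered vertical average, and writes results back with an 8-row delay through the
d[16] ring buffer. A rough, illustration-only C sketch, with the vp9_rv indexing
simplified:

    static void mbpost_proc_down_sketch(unsigned char *dst, int pitch, int rows,
                                        int cols, int flimit, const short *rv) {
      int r, c, i;
      for (c = 0; c < cols; ++c) {
        unsigned char *s = dst + c;
        unsigned char d[16];
        int sum = 0, sumsq = 0;
        for (i = -8; i <= 6; ++i) {                 /* prime the 15-row window  */
          sum   += s[i * pitch];
          sumsq += s[i * pitch] * s[i * pitch];
        }
        for (r = 0; r < rows; ++r) {
          sum   += s[7 * pitch] - s[-8 * pitch];    /* slide the window down    */
          sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
          d[r & 15] = s[0];
          if (sumsq * 15 - sum * sum < flimit)      /* low vertical variance    */
            d[r & 15] = (unsigned char)((rv[r & 127] + sum + s[0]) >> 4);
          s[0] = d[(r + 8) & 15];                   /* 8-row delayed write-back */
          s += pitch;
        }
      }
    }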
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_mmx) PRIVATE
-sym(vp9_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; advance to the next 8 pixels in this line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
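Per pixel, the noise loop above is a clamp followed by a plain add: the saturating
sub/add/sub against blackclamp/bothclamp/whiteclamp confines the pixel to
[blackclamp, 255 - whiteclamp] so that adding the signed noise sample cannot wrap.
An illustration-only scalar sketch, assuming scalar clamp values and using rand() as
a stand-in for LIBVPX_RAND:

    #include <stdlib.h>  /* rand() */

    static void plane_add_noise_sketch(unsigned char *start,
                                       const signed char *noise,
                                       unsigned char blackclamp,
                                       unsigned char whiteclamp,
                                       unsigned int width, unsigned int height,
                                       int pitch) {
      unsigned int i, j;
      for (i = 0; i < height; ++i) {
        unsigned char *pos = start + i * pitch;
        const signed char *ref = noise + (rand() & 0xff);  /* one offset per row */
        for (j = 0; j < width; ++j) {
          int p = pos[j];
          if (p < blackclamp)       p = blackclamp;        /* psubusb blackclamp */
          if (p > 255 - whiteclamp) p = 255 - whiteclamp;  /* paddusb/psubusb    */
          pos[j] = (unsigned char)(p + ref[j]);
        }
      }
    }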
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index c4efa65..0000000
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
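These shuffle masks feed _mm_maddubs_epi16: the Nth mask pairs each output position i
with the overlapping source bytes (i + 2N - 2, i + 2N - 1), so multiplying by the
duplicated filter pair (k[2N-2], k[2N-1]) and summing the four partial results
reproduces an 8-tap convolution for several outputs at once. Illustration-only scalar
view of the value those four partial sums add up to for one output:

    /* src points at the first tap for this output (src_ptr - 3 + i). */
    static int convolve8_pairs(const unsigned char *src, const signed char *k) {
      int pair, sum = 0;
      for (pair = 0; pair < 4; ++pair)    /* one term per shuffle/maddubs pair */
        sum += src[2 * pair] * k[2 * pair] +
               src[2 * pair + 1] * k[2 * pair + 1];
      return sum;                         /* rounded later by (+64) >> 7       */
    }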
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
-  // in both lanes of the 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter into the first lane
-  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bits in the filter into the first lane
-  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the second 16 bits in the filter into the second lane
-  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
-  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the forth 16 bits in the filter into the second lane
-  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
-  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-  // loading the local filters
-  shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-    // filter the source buffer
-    srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // extract the higher half of the lane
-    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
-
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr+=src_pixels_per_line;
-
-    // save only 4 bytes
-    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-    output_ptr+=output_pitch;
-  }
-}
-
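Ignoring the 16-bit saturation of the SIMD adds, each output byte produced above is
the usual rounded 8-tap result. A scalar sketch of that final arithmetic,
illustration only:

    static unsigned char convolve8_round_clamp(const unsigned char *src,
                                               const short *filter) {
      int t, sum = 0;
      for (t = 0; t < 8; ++t)
        sum += src[t - 3] * filter[t];    /* taps centred as in src_ptr - 3   */
      sum = (sum + 64) >> 7;              /* addFilterReg64 + _mm_srai_epi16  */
      return (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));  /* packus */
    }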
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
-  // in both lanes of the 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-    // filter the source buffer
-    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr+=src_pixels_per_line;
-
-    // save only 8 bytes
-    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-    output_ptr+=output_pitch;
-  }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
-  // in both lanes of the 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-    // filter the source buffer
-    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes.
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    // filter the source buffer
-    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
-    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-    src_ptr+=src_pixels_per_line;
-
-    // save 16 bytes
-    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-    output_ptr+=output_pitch;
-  }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
-  // in both lanes of the 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  for (i = 0; i < output_height; i++) {
-    // load the first 8 bytes
-    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
-    // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
-    // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr+=src_pitch;
-
-    // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-    output_ptr+=out_pitch;
-  }
-}
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
-  // in both lanes of the 128 bit register.
-  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  for (i = 0; i < output_height; i++) {
-    // load the first 16 bytes
-    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-    // load the next 16 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
-    // merge the result together
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-    // load the next 16 bytes in stride of two/three src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
-    // merge the result together
-    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
-    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-    // load the next 16 bytes in stride of four/five src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
-    // merge the result together
-    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-    // add and saturate the results together
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-    src_ptr+=src_pitch;
-
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-    output_ptr+=out_pitch;
-  }
-}
diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c
index 0797168..fb7b3b8 100644
--- a/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -12,23 +12,28 @@
 #include <stdlib.h>  // qsort()
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_thread.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
@@ -36,9 +41,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -73,19 +75,19 @@
   return len != 0 && len <= (size_t)(end - start);
 }
 
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
-  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
+  const int data = vpx_rb_read_literal(rb, get_unsigned_bits(max));
   return data > max ? max : data;
 }
 
-static TX_MODE read_tx_mode(vp9_reader *r) {
-  TX_MODE tx_mode = vp9_read_literal(r, 2);
+static TX_MODE read_tx_mode(vpx_reader *r) {
+  TX_MODE tx_mode = vpx_read_literal(r, 2);
   if (tx_mode == ALLOW_32X32)
-    tx_mode += vp9_read_bit(r);
+    tx_mode += vpx_read_bit(r);
   return tx_mode;
 }
 
-static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
   int i, j;
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -101,14 +103,14 @@
       vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
@@ -116,9 +118,9 @@
 }
 
 static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (is_compound_reference_allowed(cm)) {
-    return vp9_read_bit(r) ? (vp9_read_bit(r) ? REFERENCE_MODE_SELECT
+    return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT
                                               : COMPOUND_REFERENCE)
                            : SINGLE_REFERENCE;
   } else {
@@ -126,8 +128,8 @@
   }
 }
 
-static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
-  FRAME_CONTEXT *const fc = &cm->fc;
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) {
+  FRAME_CONTEXT *const fc = cm->fc;
   int i;
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -145,14 +147,14 @@
       vp9_diff_update_prob(r, &fc->comp_ref_prob[i]);
 }
 
-static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
-    if (vp9_read(r, MV_UPDATE_PROB))
-      p[i] = (vp9_read_literal(r, 7) << 1) | 1;
+    if (vpx_read(r, MV_UPDATE_PROB))
+      p[i] = (vpx_read_literal(r, 7) << 1) | 1;
 }
 
-static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
@@ -181,136 +183,623 @@
   }
 }
 
-static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
-  int i;
-  xd->plane[0].dequant = cm->y_dequant[q_index];
-
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    xd->plane[i].dequant = cm->uv_dequant[q_index];
-}
-
-static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, uint8_t *dst, int stride,
-                                    int eob) {
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
-    TX_TYPE tx_type = DCT_DCT;
-    int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
     if (xd->lossless) {
-      tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
-      const PLANE_TYPE plane_type = pd->plane_type;
       switch (tx_size) {
         case TX_4X4:
-          tx_type = get_tx_type_4x4(plane_type, xd, block);
-          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct4x4_add(dqcoeff, dst, stride, eob);
           break;
         case TX_8X8:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct8x8_add(dqcoeff, dst, stride, eob);
           break;
         case TX_16X16:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct16x16_add(dqcoeff, dst, stride, eob);
           break;
         case TX_32X32:
-          tx_type = DCT_DCT;
           vp9_idct32x32_add(dqcoeff, dst, stride, eob);
           break;
         default:
           assert(0 && "Invalid transform size");
+          return;
       }
     }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (eob == 1) {
-      vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+      dqcoeff[0] = 0;
     } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      if (tx_size <= TX_16X16 && eob <= 10)
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
       else if (tx_size == TX_32X32 && eob <= 34)
-        vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
       else
-        vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
     }
   }
 }
 
-struct intra_args {
-  VP9_COMMON *cm;
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-};
-
-static void predict_and_reconstruct_intra_block(int plane, int block,
-                                                BLOCK_SIZE plane_bsize,
-                                                TX_SIZE tx_size, void *arg) {
-  struct intra_args *const args = (struct intra_args *)arg;
-  VP9_COMMON *const cm = args->cm;
-  MACROBLOCKD *const xd = args->xd;
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+                                          const TX_TYPE tx_type,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  MODE_INFO *const mi = xd->mi[0];
-  const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
-                                            : mi->mbmi.uv_mode;
-  int x, y;
-  uint8_t *dst;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
+  if (eob > 0) {
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
+    if (xd->lossless) {
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  vp9_predict_intra_block(xd, block >> (tx_size << 1),
-                          b_width_log2(plane_bsize), tx_size, mode,
-                          dst, pd->dst.stride, dst, pd->dst.stride,
-                          x, y, plane);
-
-  if (!mi->mbmi.skip) {
-    const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
-                                            plane_bsize, x, y, tx_size,
-                                            args->r);
-    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
-                            eob);
+    if (eob == 1) {
+      dqcoeff[0] = 0;
+    } else {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
   }
 }
 
-struct inter_args {
-  VP9_COMMON *cm;
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-  int *eobtotal;
-};
-
-static void reconstruct_inter_block(int plane, int block,
-                                    BLOCK_SIZE plane_bsize,
-                                    TX_SIZE tx_size, void *arg) {
-  struct inter_args *args = (struct inter_args *)arg;
-  VP9_COMMON *const cm = args->cm;
-  MACROBLOCKD *const xd = args->xd;
+static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
+                                                vpx_reader *r,
+                                                MB_MODE_INFO *const mbmi,
+                                                int plane,
+                                                int row, int col,
+                                                TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int x, y, eob;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y,
-                                tx_size, args->r);
-  inverse_transform_block(xd, plane, block, tx_size,
-                          &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
-                          pd->dst.stride, eob);
-  *args->eobtotal += eob;
+  PREDICTION_MODE mode = (plane == 0) ? mbmi->mode : mbmi->uv_mode;
+  uint8_t *dst;
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+  if (mbmi->sb_type < BLOCK_8X8)
+    if (plane == 0)
+      mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+
+  vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode,
+                          dst, pd->dst.stride, dst, pd->dst.stride,
+                          col, row, plane);
+
+  if (!mbmi->skip) {
+    const TX_TYPE tx_type = (plane || xd->lossless) ?
+        DCT_DCT : intra_mode_to_tx_type_lookup[mode];
+    const scan_order *sc = (plane || xd->lossless) ?
+        &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
+    const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
+                                            r, mbmi->segment_id);
+    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                  dst, pd->dst.stride, eob);
+  }
+}
+
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
+                                   MB_MODE_INFO *const mbmi, int plane,
+                                   int row, int col, TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *sc = &vp9_default_scan_orders[tx_size];
+  const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+                                          mbmi->segment_id);
+
+  inverse_transform_block_inter(xd, plane, tx_size,
+                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+                            pd->dst.stride, eob);
+  return eob;
+}
+
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint8_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;
+
+    if (left)
+      memset(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy);
+
+    if (right)
+      memset(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;
+  } while (--b_h);
+}
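Each output row assembled by build_mc_border() is split into a left run replicated
from ref_row[0], a middle run copied verbatim, and a right run replicated from
ref_row[w - 1]. An illustration-only helper showing that split (e.g. x = -3,
b_w = 12, w = 64 gives left/copy/right of 3 / 9 / 0):

    static void border_row_split(int x, int b_w, int w,
                                 int *left, int *copy, int *right) {
      *left  = x < 0 ? -x : 0;
      if (*left > b_w) *left = b_w;
      *right = x + b_w > w ? x + b_w - w : 0;
      if (*right > b_w) *right = b_w;
      *copy = b_w - *left - *right;
    }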
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+                                 uint16_t *dst, int dst_stride,
+                                 int x, int y, int b_w, int b_h,
+                                 int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;
+
+    if (left)
+      vpx_memset16(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+    if (right)
+      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;
+  } while (--b_h);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               MACROBLOCKD *xd,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
+                         x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+  } else {
+    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
+                    x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
+  }
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+}
+#else
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;
+
+  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+                  x0, y0, b_w, b_h, frame_width, frame_height);
+  buf_ptr = mc_buf + border_offset;
+
+  inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                       int plane, int bw, int bh, int x,
+                                       int y, int w, int h, int mi_x, int mi_y,
+                                       const InterpKernel *kernel,
+                                       const struct scale_factors *sf,
+                                       struct buf_2d *pre_buf,
+                                       struct buf_2d *dst_buf, const MV* mv,
+                                       RefCntBuffer *ref_frame_buf,
+                                       int is_scaled, int ref) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+  MV32 scaled_mv;
+  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+      buf_stride, subpel_x, subpel_y;
+  uint8_t *ref_frame, *buf_ptr;
+
+  // Get reference frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = ref_frame_buf->buf.y_crop_width;
+    frame_height = ref_frame_buf->buf.y_crop_height;
+    ref_frame = ref_frame_buf->buf.y_buffer;
+  } else {
+    frame_width = ref_frame_buf->buf.uv_crop_width;
+    frame_height = ref_frame_buf->buf.uv_crop_height;
+    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                         : ref_frame_buf->buf.v_buffer;
+  }
+
+  if (is_scaled) {
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+    // Co-ordinate of containing block to pixel precision.
+    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = (x_start + x) << SUBPEL_BITS;
+    y0_16 = (y_start + y) << SUBPEL_BITS;
+
+    // Co-ordinate of current block in reference frame
+    // to 1/16th pixel precision.
+    x0_16 = sf->scale_value_x(x0_16, sf);
+    y0_16 = sf->scale_value_y(y0_16, sf);
+
+    // Map the top left corner of the block into the reference frame.
+    x0 = sf->scale_value_x(x_start + x, sf);
+    y0 = sf->scale_value_y(y_start + y, sf);
+
+    // Scale the MV and incorporate the sub-pixel offset of the block
+    // in the reference frame.
+    scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {
+    // Co-ordinate of containing block to pixel precision.
+    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = x0 << SUBPEL_BITS;
+    y0_16 = y0 << SUBPEL_BITS;
+
+    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+    xs = ys = 16;
+  }
+  subpel_x = scaled_mv.col & SUBPEL_MASK;
+  subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+  // Calculate the top left corner of the best matching block in the
+  // reference frame.
+  x0 += scaled_mv.col >> SUBPEL_BITS;
+  y0 += scaled_mv.row >> SUBPEL_BITS;
+  x0_16 += scaled_mv.col;
+  y0_16 += scaled_mv.row;
+
+  // Get reference block pointer.
+  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+  buf_stride = pre_buf->stride;
+
+  // Do border extension if there is motion or the
+  // width/height is not a multiple of 8 pixels.
+  if (is_scaled || scaled_mv.col || scaled_mv.row ||
+      (frame_width & 0x7) || (frame_height & 0x7)) {
+    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+    // Get reference block bottom right horizontal coordinate.
+    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+    int x_pad = 0, y_pad = 0;
+
+    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+      x0 -= VP9_INTERP_EXTEND - 1;
+      x1 += VP9_INTERP_EXTEND;
+      x_pad = 1;
+    }
+
+    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+      y0 -= VP9_INTERP_EXTEND - 1;
+      y1 += VP9_INTERP_EXTEND;
+      y_pad = 1;
+    }
+
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (pbi->frame_parallel_decode)
+      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+    // Skip border extension if block is inside the frame.
+    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+      // Extend the border.
+      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
+      const int b_w = x1 - x0 + 1;
+      const int b_h = y1 - y0 + 1;
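+      // Offset from the start of the padded buffer back to the pixel that
+      // corresponds to the original (x0, y0); padding moved x0/y0 back by
+      // VP9_INTERP_EXTEND - 1 (3) pixels in each padded direction.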
+      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+
+      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+                         frame_width, frame_height, border_offset,
+                         dst, dst_buf->stride,
+                         subpel_x, subpel_y,
+                         kernel, sf,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         xd,
+#endif
+                         w, h, ref, xs, ys);
+      return;
+    }
+  } else {
+    // Wait until the reference block is ready. Pad 7 more pixels as the last
+    // 7 pixels of a superblock row can be changed by the next superblock row.
+    if (pbi->frame_parallel_decode) {
+      const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+#else
+  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  const MODE_INFO *mi = xd->mi[0];
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
+  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+  const int is_compound = has_second_ref(&mi->mbmi);
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    struct buf_2d *const dst_buf = &pd->dst;
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
+
+    const int n4w_x4 = 4 * num_4x4_w;
+    const int n4h_x4 = 4 * num_4x4_h;
+    int ref;
+
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = &pd->pre[ref];
+      const int idx = xd->block_refs[ref]->idx;
+      BufferPool *const pool = pbi->common.buffer_pool;
+      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+      const int is_scaled = vp9_is_scaled(sf);
+
+      if (sb_type < BLOCK_8X8) {
+        int i = 0, x, y;
+        for (y = 0; y < num_4x4_h; ++y) {
+          for (x = 0; x < num_4x4_w; ++x) {
+            const MV mv = average_split_mvs(pd, mi, ref, i++);
+            dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+                                       4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+                                       sf, pre_buf, dst_buf, &mv,
+                                       ref_frame_buf, is_scaled, ref);
+          }
+        }
+      } else {
+        const MV mv = mi->mbmi.mv[ref].as_mv;
+        dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+                                   0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
+                                   sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+                                   is_scaled, ref);
+      }
+    }
+  }
+}
+
+static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                         int n4_wl, int n4_hl) {
+  // get minimum log2 num4x4s dimension
+  const int x = MIN(n4_wl, n4_hl);
+  return MIN(mbmi->tx_size, x);
+}
+
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h);
+  }
+}
+
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                         int bhl) {
+  int i;
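+  // Store, per plane, the block dimensions in 4x4 units (n4_w/n4_h) and their
+  // log2 (n4_wl/n4_hl), accounting for chroma subsampling.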
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+    xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+    xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+  }
 }
 
 static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 const TileInfo *const tile,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int bw, int bh, int x_mis, int y_mis,
+                                 int bwl, int bhl) {
   const int offset = mi_row * cm->mi_stride + mi_col;
   int x, y;
+  const TileInfo *const tile = &xd->tile;
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = &cm->mi[offset];
+  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+  // passing bsize from decode_partition().
   xd->mi[0]->mbmi.sb_type = bsize;
   for (y = 0; y < y_mis; ++y)
-    for (x = !y; x < x_mis; ++x)
+    for (x = !y; x < x_mis; ++x) {
       xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+    }
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
 
   set_skip_context(xd, mi_row, mi_col);
 
@@ -322,127 +811,188 @@
   return &xd->mi[0]->mbmi;
 }
 
-static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                    int idx, int mi_row, int mi_col) {
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
-  xd->block_refs[idx] = ref_buffer;
-  if (!vp9_is_valid_scale(&ref_buffer->sf))
-    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                       "Invalid scale factors");
-  vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
-                       &ref_buffer->sf);
-  xd->corrupted |= ref_buffer->buf->corrupted;
-}
-
-static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                         const TileInfo *const tile,
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          int mi_row, int mi_col,
-                         vp9_reader *r, BLOCK_SIZE bsize) {
+                         vpx_reader *r, BLOCK_SIZE bsize,
+                         int bwl, int bhl) {
+  VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
-  vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
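+  // bwl/bhl are the log2 block dimensions in 4x4 units; bw/bh convert them to
+  // 8x8 (MI) units for clamping against the frame's mi_cols/mi_rows.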
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
 
-  if (less8x8)
-    bsize = BLOCK_8X8;
+  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                                   bw, bh, x_mis, y_mis, bwl, bhl);
+
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      vpx_internal_error(xd->error_info,
+                         VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
+  }
+
+  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
   if (mbmi->skip) {
-    reset_skip_context(xd, bsize);
-  } else {
-    if (cm->seg.enabled)
-      setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
-                                                  cm->base_qindex));
+    dec_reset_skip_context(xd);
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = { cm, xd, r };
-    vp9_foreach_transformed_block(xd, bsize,
-                                  predict_and_reconstruct_intra_block, &arg);
-  } else {
-    // Setup
-    set_ref(cm, xd, 0, mi_row, mi_col);
-    if (has_second_ref(mbmi))
-      set_ref(cm, xd, 1, mi_row, mi_col);
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const TX_SIZE tx_size =
+          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                  : mbmi->tx_size;
+      const int num_4x4_w = pd->n4_w;
+      const int num_4x4_h = pd->n4_h;
+      const int step = (1 << tx_size);
+      int row, col;
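+      // mb_to_right_edge/mb_to_bottom_edge are in 1/8-pel units; shifting by
+      // (5 + subsampling) converts them to 4x4-block units for this plane so
+      // blocks that fall outside the visible frame are skipped.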
+      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+      for (row = 0; row < max_blocks_high; row += step)
+        for (col = 0; col < max_blocks_wide; col += step)
+          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+                                              row, col, tx_size);
+    }
+  } else {
     // Prediction
-    vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
 
     // Reconstruction
     if (!mbmi->skip) {
       int eobtotal = 0;
-      struct inter_args arg = { cm, xd, r, &eobtotal };
-      vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+      int plane;
+
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                    : mbmi->tx_size;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int step = (1 << tx_size);
+        int row, col;
+        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+                                                tx_size);
+      }
+
       if (!less8x8 && eobtotal == 0)
         mbmi->skip = 1;  // skip loopfilter
     }
   }
 
-  xd->corrupted |= vp9_reader_has_error(r);
+  xd->corrupted |= vpx_reader_has_error(r);
 }
 
-static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
-                                     int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                     vp9_reader *r) {
-  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = get_partition_probs(cm, ctx);
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+                                              int mi_row, int mi_col,
+                                              int bsl) {
+  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+
+//  assert(bsl >= 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                int bw) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+
+  // Update the partition context at the end of the block decode. Set the
+  // partition bits of block sizes larger than the current one to one, and
+  // the partition bits of smaller block sizes to zero.
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bw);
+}
+
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     vpx_reader *r,
+                                     int has_rows, int has_cols, int bsl) {
+  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
+  const vpx_prob *const probs = get_partition_probs(xd, ctx);
+  FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
-    p = (PARTITION_TYPE)vp9_read_tree(r, vp9_partition_tree, probs);
+    p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
   else if (!has_rows && has_cols)
-    p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.partition[ctx][p];
+  if (counts)
+    ++counts->partition[ctx][p];
 
   return p;
 }
 
-static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                             const TileInfo *const tile,
+// TODO(slavarnway): eliminate bsize and subsize in future commits
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                              int mi_row, int mi_col,
-                             vp9_reader* r, BLOCK_SIZE bsize) {
-  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int n8x8_l2 = n4x4_l2 - 1;
+  const int num_8x8_wh = 1 << n8x8_l2;
+  const int hbs = num_8x8_wh >> 1;
   PARTITION_TYPE partition;
-  BLOCK_SIZE subsize, uv_subsize;
+  BLOCK_SIZE subsize;
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
-  subsize = get_subsize(bsize, partition);
-  uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
-  if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Invalid block size.");
-  if (subsize < BLOCK_8X8) {
-    decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+  partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols,
+                             n8x8_l2);
+  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  if (!hbs) {
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
-        if (mi_row + hbs < cm->mi_rows)
-          decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        if (has_rows)
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+                       n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
-        if (mi_col + hbs < cm->mi_cols)
-          decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        if (has_cols)
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+                       n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(cm, xd, tile, mi_row,       mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row,       mi_col + hbs, r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col,       r, subsize);
-        decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+                         n8x8_l2);
         break;
       default:
         assert(0 && "Invalid partition type");
@@ -452,14 +1002,14 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
 static void setup_token_decoder(const uint8_t *data,
                                 const uint8_t *data_end,
                                 size_t read_size,
                                 struct vpx_internal_error_info *error_info,
-                                vp9_reader *r,
+                                vpx_reader *r,
                                 vpx_decrypt_cb decrypt_cb,
                                 void *decrypt_state) {
   // Validate the calculated partition length. If the buffer
@@ -469,16 +1019,16 @@
     vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
 
-  if (vp9_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
+  if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
     vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
 static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
-                                   vp9_reader *r) {
+                                   vpx_reader *r) {
   int i, j, k, l, m;
 
-  if (vp9_read_bit(r))
+  if (vpx_read_bit(r))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
@@ -488,7 +1038,7 @@
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
-                            vp9_reader *r) {
+                            vpx_reader *r) {
     const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
     TX_SIZE tx_size;
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
@@ -496,27 +1046,27 @@
 }
 
 static void setup_segmentation(struct segmentation *seg,
-                               struct vp9_read_bit_buffer *rb) {
+                               struct vpx_read_bit_buffer *rb) {
   int i, j;
 
   seg->update_map = 0;
   seg->update_data = 0;
 
-  seg->enabled = vp9_rb_read_bit(rb);
+  seg->enabled = vpx_rb_read_bit(rb);
   if (!seg->enabled)
     return;
 
   // Segmentation map update
-  seg->update_map = vp9_rb_read_bit(rb);
+  seg->update_map = vpx_rb_read_bit(rb);
   if (seg->update_map) {
     for (i = 0; i < SEG_TREE_PROBS; i++)
-      seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+      seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                : MAX_PROB;
 
-    seg->temporal_update = vp9_rb_read_bit(rb);
+    seg->temporal_update = vpx_rb_read_bit(rb);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++)
-        seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+        seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                  : MAX_PROB;
     } else {
       for (i = 0; i < PREDICTION_PROBS; i++)
@@ -525,21 +1075,21 @@
   }
 
   // Segmentation data update
-  seg->update_data = vp9_rb_read_bit(rb);
+  seg->update_data = vpx_rb_read_bit(rb);
   if (seg->update_data) {
-    seg->abs_delta = vp9_rb_read_bit(rb);
+    seg->abs_delta = vpx_rb_read_bit(rb);
 
     vp9_clearall_segfeatures(seg);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         int data = 0;
-        const int feature_enabled = vp9_rb_read_bit(rb);
+        const int feature_enabled = vpx_rb_read_bit(rb);
         if (feature_enabled) {
           vp9_enable_segfeature(seg, i, j);
           data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
           if (vp9_is_segfeature_signed(j))
-            data = vp9_rb_read_bit(rb) ? -data : data;
+            data = vpx_rb_read_bit(rb) ? -data : data;
         }
         vp9_set_segdata(seg, i, j, data);
       }
@@ -548,95 +1098,122 @@
 }
 
 static void setup_loopfilter(struct loopfilter *lf,
-                             struct vp9_read_bit_buffer *rb) {
-  lf->filter_level = vp9_rb_read_literal(rb, 6);
-  lf->sharpness_level = vp9_rb_read_literal(rb, 3);
+                             struct vpx_read_bit_buffer *rb) {
+  lf->filter_level = vpx_rb_read_literal(rb, 6);
+  lf->sharpness_level = vpx_rb_read_literal(rb, 3);
 
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   lf->mode_ref_delta_update = 0;
 
-  lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb);
+  lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb);
   if (lf->mode_ref_delta_enabled) {
-    lf->mode_ref_delta_update = vp9_rb_read_bit(rb);
+    lf->mode_ref_delta_update = vpx_rb_read_bit(rb);
     if (lf->mode_ref_delta_update) {
       int i;
 
       for (i = 0; i < MAX_REF_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
     }
   }
 }
 
-static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
-  const int old = *delta_q;
-  *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
-  return old != *delta_q;
+static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
+  return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0;
 }
 
 static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               struct vp9_read_bit_buffer *rb) {
-  int update = 0;
-
-  cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
-  update |= read_delta_q(rb, &cm->y_dc_delta_q);
-  update |= read_delta_q(rb, &cm->uv_dc_delta_q);
-  update |= read_delta_q(rb, &cm->uv_ac_delta_q);
-  if (update)
-    vp9_init_dequantizer(cm);
-
+                               struct vpx_read_bit_buffer *rb) {
+  cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
+  cm->y_dc_delta_q = read_delta_q(rb);
+  cm->uv_dc_delta_q = read_delta_q(rb);
+  cm->uv_ac_delta_q = read_delta_q(rb);
+  cm->dequant_bit_depth = cm->bit_depth;
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
 }
 
-static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
+static void setup_segmentation_dequant(VP9_COMMON *const cm) {
+  // Build y/uv dequant values based on segmentation.
+  if (cm->seg.enabled) {
+    int i;
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      const int qindex = vp9_get_qindex(&cm->seg, i, cm->base_qindex);
+      cm->y_dequant[i][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q,
+                                         cm->bit_depth);
+      cm->y_dequant[i][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+      cm->uv_dequant[i][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+                                          cm->bit_depth);
+      cm->uv_dequant[i][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+                                          cm->bit_depth);
+    }
+  } else {
+    const int qindex = cm->base_qindex;
+    // When segmentation is disabled, only the first value is used. The
+    // remaining entries are don't-cares.
+    cm->y_dequant[0][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant[0][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+    cm->uv_dequant[0][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+                                        cm->bit_depth);
+    cm->uv_dequant[0][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+                                        cm->bit_depth);
+  }
+}
+
+static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
   const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
                                               EIGHTTAP,
                                               EIGHTTAP_SHARP,
                                               BILINEAR };
-  return vp9_rb_read_bit(rb) ? SWITCHABLE
-                             : literal_to_filter[vp9_rb_read_literal(rb, 2)];
+  return vpx_rb_read_bit(rb) ? SWITCHABLE
+                             : literal_to_filter[vpx_rb_read_literal(rb, 2)];
 }
 
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
-                         int *width, int *height) {
-  const int w = vp9_rb_read_literal(rb, 16) + 1;
-  const int h = vp9_rb_read_literal(rb, 16) + 1;
-  *width = w;
-  *height = h;
-}
-
-static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_display_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   cm->display_width = cm->width;
   cm->display_height = cm->height;
-  if (vp9_rb_read_bit(rb))
+  if (vpx_rb_read_bit(rb))
     vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
+static void resize_mv_buffer(VP9_COMMON *cm) {
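+  // (Re)allocate the per-MI motion vector buffer to match the current
+  // mi_rows x mi_cols.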
+  vpx_free(cm->cur_frame->mvs);
+  cm->cur_frame->mi_rows = cm->mi_rows;
+  cm->cur_frame->mi_cols = cm->mi_cols;
+  cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                            sizeof(*cm->cur_frame->mvs));
+}
+
 static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Width and height beyond allowed size.");
+                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
+                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 #endif
   if (cm->width != width || cm->height != height) {
-    const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-    const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+    const int new_mi_rows =
+        ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    const int new_mi_cols =
+        ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2;
 
-    // Change in frame size (assumption: color format does not change).
-    if (cm->width == 0 || cm->height == 0 ||
-        aligned_width > cm->width ||
-        aligned_width * aligned_height > cm->width * cm->height) {
+    // Allocations in vp9_alloc_context_buffers() depend on individual
+    // dimensions as well as the overall size.
+    if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
       if (vp9_alloc_context_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffers");
+                           "Failed to allocate context buffers");
     } else {
       vp9_set_mb_mi(cm, width, height);
     }
@@ -644,31 +1221,58 @@
     cm->width = width;
     cm->height = height;
   }
+  if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+      cm->mi_cols > cm->cur_frame->mi_cols) {
+    resize_mv_buffer(cm);
+  }
 }
 
-static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int width, height;
+  BufferPool *const pool = cm->buffer_pool;
   vp9_read_frame_size(rb, &width, &height);
   resize_context_buffers(cm, width, height);
   setup_display_size(cm, rb);
 
-  if (vp9_realloc_frame_buffer(
+  lock_buffer_pool(pool);
+  if (vpx_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height,
-          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
-          &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
-          cm->cb_priv)) {
+          cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+          cm->use_highbitdepth,
+#endif
+          VP9_DEC_BORDER_IN_PIXELS,
+          cm->byte_alignment,
+          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+          pool->cb_priv)) {
+    unlock_buffer_pool(pool);
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  unlock_buffer_pool(pool);
+
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+}
+
+static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
+                                          int ref_xss, int ref_yss,
+                                          vpx_bit_depth_t this_bit_depth,
+                                          int this_xss, int this_yss) {
+  return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+         ref_yss == this_yss;
 }
 
 static void setup_frame_size_with_refs(VP9_COMMON *cm,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   int width, height;
   int found = 0, i;
   int has_valid_ref_frame = 0;
+  BufferPool *const pool = cm->buffer_pool;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
-    if (vp9_rb_read_bit(rb)) {
+    if (vpx_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
@@ -680,7 +1284,7 @@
   if (!found)
     vp9_read_frame_size(rb, &width, &height);
 
-  if (width <=0 || height <= 0)
+  if (width <= 0 || height <= 0)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Invalid frame size");
 
@@ -695,28 +1299,53 @@
   if (!has_valid_ref_frame)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    if (!valid_ref_frame_img_fmt(
+            ref_frame->buf->bit_depth,
+            ref_frame->buf->subsampling_x,
+            ref_frame->buf->subsampling_y,
+            cm->bit_depth,
+            cm->subsampling_x,
+            cm->subsampling_y))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has incompatible color format");
+  }
 
   resize_context_buffers(cm, width, height);
   setup_display_size(cm, rb);
 
-  if (vp9_realloc_frame_buffer(
+  lock_buffer_pool(pool);
+  if (vpx_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height,
-          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
-          &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
-          cm->cb_priv)) {
+          cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+          cm->use_highbitdepth,
+#endif
+          VP9_DEC_BORDER_IN_PIXELS,
+          cm->byte_alignment,
+          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+          pool->cb_priv)) {
+    unlock_buffer_pool(pool);
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  unlock_buffer_pool(pool);
+
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
 }
 
-static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int min_log2_tile_cols, max_log2_tile_cols, max_ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   max_ones = max_log2_tile_cols - min_log2_tile_cols;
   cm->log2_tile_cols = min_log2_tile_cols;
-  while (max_ones-- && vp9_rb_read_bit(rb))
+  while (max_ones-- && vpx_rb_read_bit(rb))
     cm->log2_tile_cols++;
 
   if (cm->log2_tile_cols > 6)
@@ -724,9 +1353,9 @@
                        "Invalid number of tile columns");
 
   // rows
-  cm->log2_tile_rows = vp9_rb_read_bit(rb);
+  cm->log2_tile_rows = vpx_rb_read_bit(rb);
   if (cm->log2_tile_rows)
-    cm->log2_tile_rows += vp9_rb_read_bit(rb);
+    cm->log2_tile_rows += vpx_rb_read_bit(rb);
 }
 
 typedef struct TileBuffer {
@@ -793,7 +1422,7 @@
                                    const uint8_t *data,
                                    const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
-  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
@@ -802,24 +1431,23 @@
   int mi_row, mi_col;
   TileData *tile_data = NULL;
 
-  if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
+  if (cm->lf.filter_level && !cm->skip_loop_filter &&
+      pbi->lf_worker.data1 == NULL) {
     CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
-    pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+    pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker;
     if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Loop filter thread creation failed");
     }
   }
 
-  if (cm->lf.filter_level) {
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-    lf_data->frame_buffer = get_frame_new_buffer(cm);
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
-    lf_data->stop = 0;
-    lf_data->y_only = 0;
-    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+    // Be sure to sync as we might be resuming after a failed frame decode.
+    winterface->sync(&pbi->lf_worker);
+    vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+                               pbi->mb.plane);
   }
 
   assert(tile_rows <= 4);
@@ -827,11 +1455,11 @@
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cm->above_context, 0,
-             sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
 
-  vpx_memset(cm->above_seg_context, 0,
-             sizeof(*cm->above_seg_context) * aligned_cols);
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_cols);
 
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -848,18 +1476,19 @@
   // Load all tile information into tile_data.
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo tile;
       const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
       tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
       tile_data->cm = cm;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col);
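+      // Frame counts are only accumulated when backward probability
+      // adaptation is allowed, i.e. not in frame-parallel decoding mode.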
+      tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
+                             NULL : &cm->counts;
+      vp9_zero(tile_data->dqcoeff);
+      vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      init_macroblockd(cm, &tile_data->xd);
-      vp9_zero(tile_data->xd.dqcoeff);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
     }
   }
 
@@ -877,13 +1506,16 @@
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
-                           &tile_data->bit_reader, BLOCK_64X64);
+          decode_partition(pbi, &tile_data->xd, mi_row,
+                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
+        if (pbi->mb.corrupted)
+            vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Failed to decode tile data");
       }
       // Loopfilter one row.
-      if (cm->lf.filter_level) {
+      if (cm->lf.filter_level && !cm->skip_loop_filter) {
         const int lf_start = mi_row - MI_BLOCK_SIZE;
         LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
 
@@ -902,11 +1534,17 @@
           winterface->execute(&pbi->lf_worker);
         }
       }
+      // After loopfiltering, the last 7 rows of pixels in each superblock row
+      // may still be changed by the longest loopfilter of the next superblock
+      // row.
+      if (pbi->frame_parallel_decode)
+        vp9_frameworker_broadcast(pbi->cur_buf,
+                                  mi_row << MI_BLOCK_SIZE_LOG2);
     }
   }
 
   // Loopfilter remaining rows in the frame.
-  if (cm->lf.filter_level) {
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     winterface->sync(&pbi->lf_worker);
     lf_data->start = lf_data->stop;
@@ -917,22 +1555,33 @@
   // Get last tile data.
   tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
 
-  return vp9_reader_find_end(&tile_data->bit_reader);
+  if (pbi->frame_parallel_decode)
+    vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
+  return vpx_reader_find_end(&tile_data->bit_reader);
 }
 
-static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
-  const TileInfo *const tile = (TileInfo*)arg2;
+static int tile_worker_hook(TileWorkerData *const tile_data,
+                            const TileInfo *const tile) {
   int mi_row, mi_col;
 
+  if (setjmp(tile_data->error_info.jmp)) {
+    tile_data->error_info.setjmp = 0;
+    tile_data->xd.corrupted = 1;
+    return 0;
+  }
+
+  tile_data->error_info.setjmp = 1;
+  tile_data->xd.error_info = &tile_data->error_info;
+
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     vp9_zero(tile_data->xd.left_context);
     vp9_zero(tile_data->xd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->cm, &tile_data->xd, tile,
-                       mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+      decode_partition(tile_data->pbi, &tile_data->xd,
+                       mi_row, mi_col, &tile_data->bit_reader,
+                       BLOCK_64X64, 4);
     }
   }
   return !tile_data->xd.corrupted;
@@ -942,20 +1591,14 @@
 static int compare_tile_buffers(const void *a, const void *b) {
   const TileBuffer *const buf1 = (const TileBuffer*)a;
   const TileBuffer *const buf2 = (const TileBuffer*)b;
-  if (buf1->size < buf2->size) {
-    return 1;
-  } else if (buf1->size == buf2->size) {
-    return 0;
-  } else {
-    return -1;
-  }
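+  // Sort tile buffers in descending order of size.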
+  return (int)(buf2->size - buf1->size);
 }
 
 static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
                                       const uint8_t *data,
                                       const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
-  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   const uint8_t *bit_reader_end = NULL;
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -974,18 +1617,21 @@
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads & ~1;
     int i;
-    // TODO(jzern): Allocate one less worker, as in the current code we only
-    // use num_threads - 1 workers.
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    // Ensure tile data offsets will be properly aligned. This may fail on
+    // platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+                    vpx_memalign(32, num_threads *
+                                 sizeof(*pbi->tile_worker_data)));
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
     for (i = 0; i < num_threads; ++i) {
-      VP9Worker *const worker = &pbi->tile_workers[i];
+      VPxWorker *const worker = &pbi->tile_workers[i];
       ++pbi->num_tile_workers;
 
       winterface->init(worker);
-      CHECK_MEM_ERROR(cm, worker->data1,
-                      vpx_memalign(32, sizeof(TileWorkerData)));
-      CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
       if (i < num_threads - 1 && !winterface->reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
@@ -995,15 +1641,19 @@
 
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
-    pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+    VPxWorker *const worker = &pbi->tile_workers[n];
+    winterface->sync(worker);
+    worker->hook = (VPxWorkerHook)tile_worker_hook;
+    worker->data1 = &pbi->tile_worker_data[n];
+    worker->data2 = &pbi->tile_worker_info[n];
   }
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cm->above_context, 0,
-             sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  vpx_memset(cm->above_seg_context, 0,
-             sizeof(*cm->above_seg_context) * aligned_mi_cols);
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_mi_cols);
 
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@@ -1028,24 +1678,38 @@
     }
   }
 
+  // Initialize thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    int i;
+
+    for (i = 0; i < num_workers; ++i) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[i].data1;
+      vp9_zero(tile_data->counts);
+    }
+  }
+
   n = 0;
   while (n < tile_cols) {
     int i;
     for (i = 0; i < num_workers && n < tile_cols; ++i) {
-      VP9Worker *const worker = &pbi->tile_workers[i];
+      VPxWorker *const worker = &pbi->tile_workers[i];
       TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
       TileInfo *const tile = (TileInfo*)worker->data2;
       TileBuffer *const buf = &tile_buffers[0][n];
 
-      tile_data->cm = cm;
+      tile_data->pbi = pbi;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      vp9_tile_init(tile, tile_data->cm, 0, buf->col);
+      tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
+                             0 : &tile_data->counts;
+      vp9_zero(tile_data->dqcoeff);
+      vp9_tile_init(tile, cm, 0, buf->col);
+      vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      init_macroblockd(cm, &tile_data->xd);
-      vp9_zero(tile_data->xd.dqcoeff);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1062,15 +1726,28 @@
     }
 
     for (; i > 0; --i) {
-      VP9Worker *const worker = &pbi->tile_workers[i - 1];
+      VPxWorker *const worker = &pbi->tile_workers[i - 1];
+      // TODO(jzern): The tile may have specific error data associated with
+      // its vpx_internal_error_info which could be propagated to the main info
+      // in cm. Additionally once the threads have been synced and an error is
+      // detected, there's no point in continuing to decode tiles.
       pbi->mb.corrupted |= !winterface->sync(worker);
     }
     if (final_worker > -1) {
       TileWorkerData *const tile_data =
           (TileWorkerData*)pbi->tile_workers[final_worker].data1;
-      bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
       final_worker = -1;
     }
+
+    // Accumulate thread frame counts.
+    if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
+      for (i = 0; i < num_workers; ++i) {
+        TileWorkerData *const tile_data =
+            (TileWorkerData*)pbi->tile_workers[i].data1;
+        vp9_accumulate_frame_counts(cm, &tile_data->counts, 1);
+      }
+    }
   }
 
   return bit_reader_end;
@@ -1081,34 +1758,29 @@
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
-  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
-}
-
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
-  int profile = vp9_rb_read_bit(rb);
-  profile |= vp9_rb_read_bit(rb) << 1;
-  if (profile > 2)
-    profile += vp9_rb_read_bit(rb);
-  return (BITSTREAM_PROFILE) profile;
-}
-
 static void read_bitdepth_colorspace_sampling(
-    VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
-  if (cm->profile >= PROFILE_2)
-    cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
-  cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
-  if (cm->color_space != SRGB) {
-    vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+    VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  if (cm->profile >= PROFILE_2) {
+    cm->bit_depth = vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 1;
+#endif
+  } else {
+    cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = 0;
+#endif
+  }
+  cm->color_space = vpx_rb_read_literal(rb, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
+    vpx_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      cm->subsampling_x = vp9_rb_read_bit(rb);
-      cm->subsampling_y = vp9_rb_read_bit(rb);
+      cm->subsampling_x = vpx_rb_read_bit(rb);
+      cm->subsampling_y = vpx_rb_read_bit(rb);
       if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "4:2:0 color not supported in profile 1 or 3");
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1119,7 +1791,7 @@
       // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
       // 4:2:2 or 4:4:0 chroma sampling is not allowed.
       cm->subsampling_y = cm->subsampling_x = 0;
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1130,42 +1802,59 @@
 }
 
 static size_t read_uncompressed_header(VP9Decoder *pbi,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  int i, mask, ref_index = 0;
   size_t sz;
-  int i;
 
   cm->last_frame_type = cm->frame_type;
+  cm->last_intra_only = cm->intra_only;
 
-  if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
+  if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Invalid frame marker");
 
   cm->profile = vp9_read_profile(rb);
+#if CONFIG_VP9_HIGHBITDEPTH
   if (cm->profile >= MAX_PROFILES)
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Unsupported bitstream profile");
+#else
+  if (cm->profile >= PROFILE_2)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#endif
 
-  cm->show_existing_frame = vp9_rb_read_bit(rb);
+  cm->show_existing_frame = vpx_rb_read_bit(rb);
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
-    const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
-
-    if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
+    const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
+    lock_buffer_pool(pool);
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      unlock_buffer_pool(pool);
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Buffer %d does not contain a decoded frame",
                          frame_to_show);
+    }
 
-    ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+    unlock_buffer_pool(pool);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
     cm->show_frame = 1;
+
+    if (pbi->frame_parallel_decode) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+    }
     return 0;
   }
 
-  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
-  cm->show_frame = vp9_rb_read_bit(rb);
-  cm->error_resilient_mode = vp9_rb_read_bit(rb);
+  cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb);
+  cm->show_frame = vpx_rb_read_bit(rb);
+  cm->error_resilient_mode = vpx_rb_read_bit(rb);
 
   if (cm->frame_type == KEY_FRAME) {
     if (!vp9_read_sync_code(rb))
@@ -1176,16 +1865,20 @@
     pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
 
     for (i = 0; i < REFS_PER_FRAME; ++i) {
-      cm->frame_refs[i].idx = -1;
+      cm->frame_refs[i].idx = INVALID_IDX;
       cm->frame_refs[i].buf = NULL;
     }
 
     setup_frame_size(cm, rb);
+    if (pbi->need_resync) {
+      memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+      pbi->need_resync = 0;
+    }
   } else {
-    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
+    cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
     cm->reset_frame_context = cm->error_resilient_mode ?
-        0 : vp9_rb_read_literal(rb, 2);
+        0 : vpx_rb_read_literal(rb, 2);
 
     if (cm->intra_only) {
       if (!vp9_read_sync_code(rb))
@@ -1196,45 +1889,69 @@
       } else {
         // NOTE: The intra-only frame header does not include the specification
         // of either the color format or color sub-sampling in profile 0. VP9
-        // specifies that the default color space should be YUV 4:2:0 in this
+        // specifies that the default color format should be YUV 4:2:0 in this
         // case (normative).
-        cm->color_space = BT_601;
+        cm->color_space = VPX_CS_BT_601;
         cm->subsampling_y = cm->subsampling_x = 1;
+        cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+        cm->use_highbitdepth = 0;
+#endif
       }
 
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
-    } else {
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      if (pbi->need_resync) {
+        memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+        pbi->need_resync = 0;
+      }
+    } else if (pbi->need_resync != 1) {  /* Skip if need resync */
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       for (i = 0; i < REFS_PER_FRAME; ++i) {
-        const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
         RefBuffer *const ref_frame = &cm->frame_refs[i];
         ref_frame->idx = idx;
-        ref_frame->buf = &cm->frame_bufs[idx].buf;
-        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
+        ref_frame->buf = &frame_bufs[idx].buf;
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
       }
 
       setup_frame_size_with_refs(cm, rb);
 
-      cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
+      cm->allow_high_precision_mv = vpx_rb_read_bit(rb);
       cm->interp_filter = read_interp_filter(rb);
 
       for (i = 0; i < REFS_PER_FRAME; ++i) {
         RefBuffer *const ref_buf = &cm->frame_refs[i];
+#if CONFIG_VP9_HIGHBITDEPTH
+        vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                          ref_buf->buf->y_crop_width,
+                                          ref_buf->buf->y_crop_height,
+                                          cm->width, cm->height,
+                                          cm->use_highbitdepth);
+#else
         vp9_setup_scale_factors_for_frame(&ref_buf->sf,
                                           ref_buf->buf->y_crop_width,
                                           ref_buf->buf->y_crop_height,
                                           cm->width, cm->height);
-        if (vp9_is_scaled(&ref_buf->sf))
-          vp9_extend_frame_borders(ref_buf->buf);
+#endif
       }
     }
   }
+#if CONFIG_VP9_HIGHBITDEPTH
+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
+  get_frame_new_buffer(cm)->color_space = cm->color_space;
+
+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
 
   if (!cm->error_resilient_mode) {
-    cm->refresh_frame_context = vp9_rb_read_bit(rb);
-    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
+    cm->refresh_frame_context = vpx_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
   } else {
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
@@ -1242,7 +1959,31 @@
 
   // This flag will be overridden by the call to vp9_setup_past_independence
   // below, forcing the use of context 0 for those frame types.
-  cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+  cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;
 
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
     vp9_setup_past_independence(cm);
@@ -1250,9 +1991,10 @@
   setup_loopfilter(&cm->lf, rb);
   setup_quantization(cm, &pbi->mb, rb);
   setup_segmentation(&cm->seg, rb);
+  setup_segmentation_dequant(cm);
 
   setup_tile_info(cm, rb);
-  sz = vp9_rb_read_literal(rb, 16);
+  sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1265,11 +2007,11 @@
                                   size_t partition_size) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  FRAME_CONTEXT *const fc = &cm->fc;
-  vp9_reader r;
+  FRAME_CONTEXT *const fc = cm->fc;
+  vpx_reader r;
   int k;
 
-  if (vp9_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+  if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
                       pbi->decrypt_state))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
@@ -1310,19 +2052,7 @@
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
   }
 
-  return vp9_reader_has_error(&r);
-}
-
-void vp9_init_dequantizer(VP9_COMMON *cm) {
-  int q;
-
-  for (q = 0; q < QINDEX_RANGE; q++) {
-    cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q);
-    cm->y_dequant[q][1] = vp9_ac_quant(q, 0);
-
-    cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q);
-    cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q);
-  }
+  return vpx_reader_has_error(&r);
 }
 
 #ifdef NDEBUG
@@ -1362,12 +2092,12 @@
 }
 #endif  // NDEBUG
 
-static struct vp9_read_bit_buffer* init_read_bit_buffer(
+static struct vpx_read_bit_buffer *init_read_bit_buffer(
     VP9Decoder *pbi,
-    struct vp9_read_bit_buffer *rb,
+    struct vpx_read_bit_buffer *rb,
     const uint8_t *data,
     const uint8_t *data_end,
-    uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) {
+    uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
   rb->bit_offset = 0;
   rb->error_handler = error_handler;
   rb->error_handler_data = &pbi->common;
@@ -1383,13 +2113,35 @@
   return rb;
 }
 
+//------------------------------------------------------------------------------
+
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) {
+  return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+}
+
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
+                         int *width, int *height) {
+  *width = vpx_rb_read_literal(rb, 16) + 1;
+  *height = vpx_rb_read_literal(rb, 16) + 1;
+}
+
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) {
+  int profile = vpx_rb_read_bit(rb);
+  profile |= vpx_rb_read_bit(rb) << 1;
+  if (profile > 2)
+    profile += vpx_rb_read_bit(rb);
+  return (BITSTREAM_PROFILE) profile;
+}
+
 void vp9_decode_frame(VP9Decoder *pbi,
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
-
+  struct vpx_read_bit_buffer rb;
+  int context_updated = 0;
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
       init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1400,47 +2152,79 @@
 
   if (!first_partition_size) {
     // showing a frame directly
-    *p_data_end = data + 1;
+    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
     return;
   }
 
-  data += vp9_rb_bytes_read(&rb);
+  data += vpx_rb_bytes_read(&rb);
   if (!read_is_valid(data, first_partition_size, data_end))
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
-  init_macroblockd(cm, &pbi->mb);
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->last_intra_only &&
+                           cm->last_show_frame &&
+                           (cm->last_frame_type != KEY_FRAME);
 
-  if (!cm->error_resilient_mode)
-    set_prev_mi(cm);
-  else
-    cm->prev_mi = NULL;
-
-  setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  if (!cm->fc->initialized)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+
   vp9_zero(cm->counts);
-  vp9_zero(xd->dqcoeff);
 
   xd->corrupted = 0;
   new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+  if (new_fb->corrupted)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data header is corrupted.");
 
-  // TODO(jzern): remove frame_parallel_decoding_mode restriction for
-  // single-frame tile decoding.
-  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
-      cm->frame_parallel_decoding_mode) {
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+  }
+
+  // When the frame is encoded in frame-parallel mode, the frame context is
+  // ready after decoding the frame header.
+  if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    if (cm->refresh_frame_context) {
+      context_updated = 1;
+      cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+    }
+    vp9_frameworker_lock_stats(worker);
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    frame_worker_data->frame_context_ready = 1;
+    // Signal the main thread that context is ready.
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  }
+
+  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+    // Multi-threaded tile decoder
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
-    // If multiple threads are used to decode tiles, then we use those threads
-    // to do parallel loopfiltering.
-    vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+    if (!xd->corrupted) {
+      if (!cm->skip_loop_filter) {
+        // If multiple threads are used to decode tiles, then we use those
+        // threads to do parallel loopfiltering.
+        vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
+                                 cm->lf.filter_level, 0, 0, pbi->tile_workers,
+                                 pbi->num_tile_workers, &pbi->lf_row_sync);
+      }
+    } else {
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Decode failed. Frame data is corrupted.");
+    }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
 
-  new_fb->corrupted |= xd->corrupted;
-
-  if (!new_fb->corrupted) {
+  if (!xd->corrupted) {
     if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
       vp9_adapt_coef_probs(cm);
 
@@ -1456,6 +2240,7 @@
                        "Decode failed. Frame data is corrupted.");
   }
 
-  if (cm->refresh_frame_context)
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+  // For non-frame-parallel decoding, update the frame context here.
+  if (cm->refresh_frame_context && !context_updated)
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 }
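
The uncompressed-header parsing above now goes through the renamed
vpx_rb_read_bit()/vpx_rb_read_literal() helpers. As a minimal sketch of the
bit-buffer semantics those calls assume (illustrative names only, not the
vpx_dsp implementation): bits are consumed most-significant-bit first, and an
n-bit literal is just n single-bit reads folded together.

    /* Illustrative sketch only. */
    #include <stddef.h>
    #include <stdint.h>

    struct demo_rb { const uint8_t *data; size_t bit_offset; };

    static int demo_rb_read_bit(struct demo_rb *rb) {
      const size_t off = rb->bit_offset++;
      return (rb->data[off >> 3] >> (7 - (off & 7))) & 1;  /* MSB first */
    }

    static int demo_rb_read_literal(struct demo_rb *rb, int bits) {
      int value = 0, bit;
      for (bit = bits - 1; bit >= 0; --bit)
        value |= demo_rb_read_bit(rb) << bit;
      return value;
    }

With a hypothetical first byte of 0x82 (1000 0010b), two demo_rb_read_bit()
calls return 1 and then 0, and demo_rb_read_literal(&rb, 2) on fresh data
returns 2, which is how two-bit fields such as reset_frame_context above are
assembled.
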
diff --git a/libvpx/vp9/decoder/vp9_decodeframe.h b/libvpx/vp9/decoder/vp9_decodeframe.h
index 10a9e34..05af706 100644
--- a/libvpx/vp9/decoder/vp9_decodeframe.h
+++ b/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -16,21 +16,18 @@
 extern "C" {
 #endif
 
-struct VP9Common;
 struct VP9Decoder;
-struct vp9_read_bit_buffer;
+struct vpx_read_bit_buffer;
 
-void vp9_init_dequantizer(struct VP9Common *cm);
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
+                         int *width, int *height);
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb);
 
 void vp9_decode_frame(struct VP9Decoder *pbi,
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end);
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
-                         int *width, int *height);
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
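
The accessors re-exported by this header are small fixed-layout reads, so a
worked example helps. vp9_read_frame_size() stores width-1 and height-1 as
16-bit literals, and vp9_read_profile() packs the profile as a low bit, a high
bit, and one extra bit only when both are set. Hypothetical values:

    /* Hypothetical values, mirroring the reads in vp9_decodeframe.c. */
    int width   = 1279 + 1;          /* 16-bit literal 1279 -> width 1280  */
    int height  =  719 + 1;          /* 16-bit literal  719 -> height 720  */
    int profile = 1;                 /* first profile bit                  */
    profile |= 1 << 1;               /* second profile bit -> 3            */
    if (profile > 2)
      profile += 0;                  /* extra bit; a 1 here would give 4   */
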
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 32e80f9..33818a9 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -21,61 +21,68 @@
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_reader.h"
 
-static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
-  return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
+static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p);
 }
 
-static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
-                                            int size_group) {
+static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                         vpx_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
-      read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.y_mode[size_group][y_mode];
+      read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->y_mode[size_group][y_mode];
   return y_mode;
 }
 
-static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                          vpx_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
-                                         cm->fc.uv_mode_prob[y_mode]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.uv_mode[y_mode][uv_mode];
+                                         cm->fc->uv_mode_prob[y_mode]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->uv_mode[y_mode][uv_mode];
   return uv_mode;
 }
 
-static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
-  const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
-                                 cm->fc.inter_mode_probs[ctx]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.inter_mode[ctx][mode];
+static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                       vpx_reader *r, int ctx) {
+  const int mode = vpx_read_tree(r, vp9_inter_mode_tree,
+                                 cm->fc->inter_mode_probs[ctx]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->inter_mode[ctx][mode];
 
   return NEARESTMV + mode;
 }
 
-static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
-  return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
+static int read_segment_id(vpx_reader *r, const struct segmentation *seg) {
+  return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs);
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     TX_SIZE max_tx_size, vp9_reader *r) {
-  const int ctx = vp9_get_tx_size_context(xd);
-  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
-  int tx_size = vp9_read(r, tx_probs[0]);
+                                     TX_SIZE max_tx_size, vpx_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;
+  const int ctx = get_tx_size_context(xd);
+  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
+  int tx_size = vpx_read(r, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    tx_size += vp9_read(r, tx_probs[1]);
+    tx_size += vpx_read(r, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      tx_size += vp9_read(r, tx_probs[2]);
+      tx_size += vpx_read(r, tx_probs[2]);
   }
 
-  if (!cm->frame_parallel_decoding_mode)
-    ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
+  if (counts)
+    ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
   return (TX_SIZE)tx_size;
 }
 
-static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
-                            BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            int allow_select, vpx_reader *r) {
+  TX_MODE tx_mode = cm->tx_mode;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
     return read_selected_tx_size(cm, xd, max_tx_size, r);
@@ -83,93 +90,134 @@
     return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
 }
 
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
-                           int mi_row, int mi_col, int segment_id) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int x, y, segment_id = INT_MAX;
+
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
+      segment_id = MIN(segment_id,
+                       segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+static void set_segment_id(VP9_COMMON *cm, int mi_offset,
+                           int x_mis, int y_mis, int segment_id) {
   int x, y;
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
 
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
+      cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }
 
-static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col,
-                                 vp9_reader *r) {
+static void copy_segment_id(const VP9_COMMON *cm,
+                           const uint8_t *last_segment_ids,
+                           uint8_t *current_segment_ids,
+                           int mi_offset, int x_mis, int y_mis) {
+  int x, y;
+
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
+      current_segment_ids[mi_offset + y * cm->mi_cols + x] =  last_segment_ids ?
+          last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
+}
+
+static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
+                                 int x_mis, int y_mis,
+                                 vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   int segment_id;
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
-  if (!seg->update_map)
+  if (!seg->update_map) {
+    copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+                    mi_offset, x_mis, y_mis);
     return 0;
+  }
 
   segment_id = read_segment_id(r, seg);
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
 static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vp9_reader *r) {
+                                 int mi_row, int mi_col, vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
   int predicted_segment_id, segment_id;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
-  predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
-                                            bsize, mi_row, mi_col);
-  if (!seg->update_map)
+  predicted_segment_id = cm->last_frame_seg_map ?
+      dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+      0;
+
+  if (!seg->update_map) {
+    copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+                    mi_offset, x_mis, y_mis);
     return predicted_segment_id;
+  }
 
   if (seg->temporal_update) {
-    const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-    mbmi->seg_id_predicted = vp9_read(r, pred_prob);
+    const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
     segment_id = mbmi->seg_id_predicted ? predicted_segment_id
                                         : read_segment_id(r, seg);
   } else {
     segment_id = read_segment_id(r, seg);
   }
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
 static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                     int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+                     int segment_id, vpx_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
-    const int skip = vp9_read(r, cm->fc.skip_probs[ctx]);
-    if (!cm->frame_parallel_decoding_mode)
-      ++cm->counts.skip[ctx][skip];
+    const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+    FRAME_COUNTS *counts = xd->counts;
+    if (counts)
+      ++counts->skip[ctx][skip];
     return skip;
   }
 }
 
 static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *above_mi = xd->mi[-cm->mi_stride];
-  const MODE_INFO *left_mi  = xd->left_available ? xd->mi[-1] : NULL;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi  = xd->left_mi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   int i;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
 
-  mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+
+  mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
+  mbmi->tx_size = read_tx_size(cm, xd, 1, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE;
 
@@ -200,44 +248,45 @@
   mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
 }
 
-static int read_mv_component(vp9_reader *r,
+static int read_mv_component(vpx_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
-  const int sign = vp9_read(r, mvcomp->sign);
-  const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
+  const int sign = vpx_read(r, mvcomp->sign);
+  const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    mag = 0;
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
 
     d = 0;
     for (i = 0; i < n; ++i)
-      d |= vp9_read(r, mvcomp->bits[i]) << i;
+      d |= vpx_read(r, mvcomp->bits[i]) << i;
+    mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
-  fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+  fr = vpx_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
-
   // High precision part (if hp is not used, the default value of the hp is 1)
-  hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
 
   // Result
-  mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
+  mag += ((d << 3) | (fr << 1) | hp) + 1;
   return sign ? -mag : mag;
 }
 
-static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
+static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
-      (MV_JOINT_TYPE)vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
+      (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
   const int use_hp = allow_hp && vp9_use_mv_hp(ref);
   MV diff = {0, 0};
 
@@ -255,13 +304,14 @@
 
 static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp9_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]);
-    if (!cm->frame_parallel_decoding_mode)
-      ++cm->counts.comp_inter[ctx][mode];
+        (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
+    FRAME_COUNTS *counts = xd->counts;
+    if (counts)
+      ++counts->comp_inter[ctx][mode];
     return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
   } else {
     return cm->reference_mode;
@@ -270,14 +320,14 @@
 
 // Read the reference frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                            vp9_reader *r,
+                            vpx_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
-  FRAME_CONTEXT *const fc = &cm->fc;
-  FRAME_COUNTS *const counts = &cm->counts;
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = xd->counts;
 
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id,
-                                                       SEG_LVL_REF_FRAME);
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
@@ -285,20 +335,20 @@
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
-      if (!cm->frame_parallel_decoding_mode)
+      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
+      if (counts)
         ++counts->comp_ref[ctx][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
       ref_frame[!idx] = cm->comp_var_ref[bit];
     } else if (mode == SINGLE_REFERENCE) {
       const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
-      if (!cm->frame_parallel_decoding_mode)
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      if (counts)
         ++counts->single_ref[ctx0][0][bit0];
       if (bit0) {
         const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
-        if (!cm->frame_parallel_decoding_mode)
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
       } else {
@@ -314,18 +364,21 @@
 
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
-    VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
+    VP9_COMMON *const cm, MACROBLOCKD *const xd,
+    vpx_reader *r) {
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
-      (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
-                                   cm->fc.switchable_interp_prob[ctx]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.switchable_interp[ctx][type];
+      (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
+                                   cm->fc->switchable_interp_prob[ctx]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->switchable_interp[ctx][type];
   return type;
 }
 
-static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
-                                       vp9_reader *r) {
+static void read_intra_block_mode_info(VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd, MODE_INFO *mi,
+                                       vpx_reader *r) {
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   int i;
@@ -336,24 +389,26 @@
   switch (bsize) {
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
-        mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0);
+        mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0);
       mbmi->mode = mi->bmi[3].as_mode;
       break;
     case BLOCK_4X8:
-      mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0);
+      mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd,
+                                                                  r, 0);
       mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode_y(cm, r, 0);
+          read_intra_mode_y(cm, xd, r, 0);
       break;
     case BLOCK_8X4:
-      mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0);
+      mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd,
+                                                                  r, 0);
       mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode_y(cm, r, 0);
+          read_intra_mode_y(cm, xd, r, 0);
       break;
     default:
-      mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
+      mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
   }
 
-  mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+  mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -361,19 +416,20 @@
          mv->col > MV_LOW && mv->col < MV_UPP;
 }
 
-static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
+static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
-                            int is_compound, int allow_hp, vp9_reader *r) {
+                            int is_compound, int allow_hp, vpx_reader *r) {
   int i;
   int ret = 1;
 
   switch (mode) {
     case NEWMV: {
-      nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
-                                            NULL : &cm->counts.mv;
+      FRAME_COUNTS *counts = xd->counts;
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
       for (i = 0; i < 1 + is_compound; ++i) {
-        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
                 allow_hp);
         ret = ret && is_mv_valid(&mv[i].as_mv);
       }
@@ -405,62 +461,71 @@
 }
 
 static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
-           INTRA_FRAME;
+                               int segment_id, vpx_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
-    const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]);
-    if (!cm->frame_parallel_decoding_mode)
-      ++cm->counts.intra_inter[ctx][is_inter];
+    const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
+    FRAME_COUNTS *counts = xd->counts;
+    if (counts)
+      ++counts->intra_inter[ctx][is_inter];
     return is_inter;
   }
 }
 
-static void read_inter_block_mode_info(VP9_COMMON *const cm,
+static void fpm_sync(void *const data, int mi_row) {
+  VP9Decoder *const pbi = (VP9Decoder *)data;
+  vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
+                       mi_row << MI_BLOCK_SIZE_LOG2);
+}
+
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       const TileInfo *const tile,
                                        MODE_INFO *const mi,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-
   int_mv nearestmv[2], nearmv[2];
-  int inter_mode_ctx, ref, is_compound;
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int ref, is_compound;
+  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    const int ref_idx = frame - LAST_FRAME;
-    if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE ||
-        cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE )
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if ((!vp9_is_valid_scale(&ref_buf->sf)))
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                          "Reference frame has invalid dimensions");
-    vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
-                     mi_row, mi_col);
+    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+                         &ref_buf->sf);
+    vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
+                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
-  inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
-
-  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
-        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                            "Invalid usage of segement feature on small blocks");
         return;
     }
   } else {
     if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
+      mbmi->mode = read_inter_mode(cm, xd, r,
+                                   inter_mode_ctx[mbmi->ref_frame[0]]);
   }
 
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
     for (ref = 0; ref < 1 + is_compound; ++ref) {
-      vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
+      vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]],
                             &nearestmv[ref], &nearmv[ref]);
     }
   }
@@ -470,8 +535,8 @@
                       : cm->interp_filter;
 
   if (bsize < BLOCK_8X8) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];  // 1 or 2
-    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
+    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
     int idx, idy;
     PREDICTION_MODE b_mode;
     int_mv nearest_sub8x8[2], near_sub8x8[2];
@@ -479,20 +544,23 @@
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, r, inter_mode_ctx);
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
 
-        if (b_mode == NEARESTMV || b_mode == NEARMV)
+        if (b_mode == NEARESTMV || b_mode == NEARMV) {
+          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
           for (ref = 0; ref < 1 + is_compound; ++ref)
-            vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
+            vp9_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
                                           &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref]);
+                                          &near_sub8x8[ref],
+                                          dummy_mode_ctx);
+        }
 
-        if (!assign_mv(cm, b_mode, block, nearestmv,
+        if (!assign_mv(cm, xd, b_mode, block, nearestmv,
                        nearest_sub8x8, near_sub8x8,
                        is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
           break;
-        };
+        }
 
         mi->bmi[j].as_mv[0].as_int = block[0].as_int;
         if (is_compound)
@@ -510,15 +578,15 @@
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+    xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv,
                                 nearestmv, nearmv, is_compound, allow_hp, r);
   }
 }
 
-static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       const TileInfo *const tile,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   int inter_block;
@@ -528,20 +596,36 @@
   mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
   inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
-                               !mbmi->skip || !inter_block, r);
+  mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
 
   if (inter_block)
-    read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
   else
-    read_intra_block_mode_info(cm, mi, r);
+    read_intra_block_mode_info(cm, xd, mi, r);
 }
 
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
-                        const TileInfo *const tile,
-                        int mi_row, int mi_col, vp9_reader *r) {
-  if (frame_is_intra_only(cm))
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis) {
+  VP9_COMMON *const cm = &pbi->common;
+  MODE_INFO *const mi = xd->mi[0];
+  MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
+  if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
-  else
-    read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+  } else {
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+        mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+        mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+        mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+      }
+    }
+  }
 }
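
read_mv_component() above now assembles the magnitude inline instead of
calling a helper: class 0 starts from 0, any other class starts from
CLASS0_SIZE << (mv_class + 2), and the integer, fraction and high-precision
bits are then folded in. A worked example with hypothetical bit values,
assuming CLASS0_SIZE is 2 as defined in the libvpx entropy headers:

    /* mv_class = 1, d = 1, fr = 2, hp = 1, sign = 1 (hypothetical)     */
    /* mag  = 2 << (1 + 2)                  = 16                        */
    /* mag += ((1 << 3) | (2 << 1) | 1) + 1 = 16 + 13 + 1 = 30          */
    /* sign is set, so the component decodes to -30 in eighth-pel units */
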
diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h
index 7394b62..75f568c 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/libvpx/vp9/decoder/vp9_decodemv.h
@@ -11,17 +11,17 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_dsp/bitreader.h"
+
+#include "vp9/decoder/vp9_decoder.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct TileInfo;
-
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
-                        const struct TileInfo *const tile,
-                        int mi_row, int mi_col, vp9_reader *r);
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
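
The x_mis/y_mis arguments added to vpx_read_mode_info() are the block's extent
in 8x8 mode-info units, clamped at the frame edge the same way the segment-id
readers in vp9_decodemv.c compute it (x_mis = MIN(cm->mi_cols - mi_col, bw)).
A worked example with hypothetical dimensions:

    /* A 64x64 superblock is 8 mode-info units wide (bw = 8). For a frame
     * 176 pixels wide, cm->mi_cols = 22. At mi_col = 16:
     *   x_mis = MIN(22 - 16, 8) = 6
     * so only the 6 columns that lie inside the frame are written to the
     * per-frame MV array and the segment-id map. */
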
diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c
index 2a2f0f5..6734d00 100644
--- a/libvpx/vp9/decoder/vp9_decoder.c
+++ b/libvpx/vp9/decoder/vp9_decoder.c
@@ -12,11 +12,16 @@
 #include <limits.h>
 #include <stdio.h>
 
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
@@ -25,25 +30,52 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_reconintra.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_dthread.h"
 
-static void initialize_dec() {
-  static int init_done = 0;
+static void initialize_dec(void) {
+  static volatile int init_done = 0;
 
   if (!init_done) {
-    vp9_init_neighbors();
+    vp9_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+    vp9_init_intra_predictors();
     init_done = 1;
   }
 }
 
-VP9Decoder *vp9_decoder_create() {
-  VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
-  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
+static void vp9_dec_setup_mi(VP9_COMMON *cm) {
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (!cm->mip)
+    return 1;
+  cm->mi_alloc_size = mi_size;
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->mi_grid_base)
+    return 1;
+  return 0;
+}
+
+static void vp9_dec_free_mi(VP9_COMMON *cm) {
+  vpx_free(cm->mip);
+  cm->mip = NULL;
+  vpx_free(cm->mi_grid_base);
+  cm->mi_grid_base = NULL;
+}
+
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
+  VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
 
   if (!cm)
     return NULL;
@@ -57,52 +89,58 @@
   }
 
   cm->error.setjmp = 1;
-  initialize_dec();
 
-  vp9_rtcd();
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
+  pbi->need_resync = 1;
+  once(initialize_dec);
 
   // Initialize the references to not point to any frame buffers.
-  vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
 
   cm->current_video_frame = 0;
   pbi->ready_for_new_data = 1;
+  pbi->common.buffer_pool = pool;
 
-  // vp9_init_dequantizer() is first called here. Add check in
-  // frame_init_dequantizer() to avoid unnecessary calling of
-  // vp9_init_dequantizer() for every frame.
-  vp9_init_dequantizer(cm);
+  cm->bit_depth = VPX_BITS_8;
+  cm->dequant_bit_depth = VPX_BITS_8;
+
+  cm->alloc_mi = vp9_dec_alloc_mi;
+  cm->free_mi = vp9_dec_free_mi;
+  cm->setup_mi = vp9_dec_setup_mi;
 
   vp9_loop_filter_init(cm);
 
   cm->error.setjmp = 0;
 
-  vp9_get_worker_interface()->init(&pbi->lf_worker);
+  vpx_get_worker_interface()->init(&pbi->lf_worker);
 
   return pbi;
 }
 
 void vp9_decoder_remove(VP9Decoder *pbi) {
-  VP9_COMMON *const cm = &pbi->common;
   int i;
 
-  vp9_get_worker_interface()->end(&pbi->lf_worker);
+  vpx_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi->tile_data);
   for (i = 0; i < pbi->num_tile_workers; ++i) {
-    VP9Worker *const worker = &pbi->tile_workers[i];
-    vp9_get_worker_interface()->end(worker);
-    vpx_free(worker->data1);
-    vpx_free(worker->data2);
+    VPxWorker *const worker = &pbi->tile_workers[i];
+    vpx_get_worker_interface()->end(worker);
   }
+  vpx_free(pbi->tile_worker_data);
+  vpx_free(pbi->tile_worker_info);
   vpx_free(pbi->tile_workers);
 
-  if (pbi->num_tile_workers) {
-    const int sb_rows =
-        mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
-    vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
+  if (pbi->num_tile_workers > 0) {
+    vp9_loop_filter_dealloc(&pbi->lf_row_sync);
   }
 
-  vp9_remove_common(cm);
   vpx_free(pbi);
 }
 
@@ -123,8 +161,12 @@
    * later commit that adds VP9-specific controls for this functionality.
    */
   if (ref_frame_flag == VP9_LAST_FLAG) {
-    const YV12_BUFFER_CONFIG *const cfg =
-        &cm->frame_bufs[cm->ref_frame_map[0]].buf;
+    const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0);
+    if (cfg == NULL) {
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "No 'last' reference frame");
+      return VPX_CODEC_ERROR;
+    }
     if (!equal_dimensions(cfg, sd))
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Incorrect buffer dimensions");
@@ -143,6 +185,7 @@
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd) {
   RefBuffer *ref_buf = NULL;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
   // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
   // encoder is using the frame buffers for. This is just a stub to keep the
@@ -168,13 +211,16 @@
 
     // Find an empty frame buffer.
     const int free_fb = get_free_fb(cm);
+    if (cm->new_fb_idx == INVALID_IDX)
+      return VPX_CODEC_MEM_ERROR;
+
     // Decrease ref_count since it will be increased again in
     // ref_cnt_fb() below.
-    cm->frame_bufs[free_fb].ref_count--;
+    --frame_bufs[free_fb].ref_count;
 
     // Manage the reference counters and copy image.
-    ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
+    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
     vp8_yv12_copy_frame(sd, ref_buf->buf);
   }
 
@@ -185,33 +231,51 @@
 static void swap_frame_buffers(VP9Decoder *pbi) {
   int ref_index = 0, mask;
   VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
+  lock_buffer_pool(pool);
   for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-    if (mask & 1) {
-      const int old_idx = cm->ref_frame_map[ref_index];
-      ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
-                 cm->new_fb_idx);
-      if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
-        cm->release_fb_cb(cm->cb_priv,
-                          &cm->frame_bufs[old_idx].raw_frame_buffer);
+    const int old_idx = cm->ref_frame_map[ref_index];
+    // The current thread releases its hold on the reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in reference map.
+    if ((mask & 1) && old_idx >= 0) {
+      decrease_ref_count(old_idx, frame_bufs, pool);
     }
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
 
+  // The current thread releases its hold on the reference frame.
+  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 0;
   cm->frame_to_show = get_frame_new_buffer(cm);
-  cm->frame_bufs[cm->new_fb_idx].ref_count--;
+
+  if (!pbi->frame_parallel_decode || !cm->show_frame) {
+    lock_buffer_pool(pool);
+    --frame_bufs[cm->new_fb_idx].ref_count;
+    unlock_buffer_pool(pool);
+  }
 
   // Invalidate these references until the next frame starts.
   for (ref_index = 0; ref_index < 3; ref_index++)
-    cm->frame_refs[ref_index].idx = INT_MAX;
+    cm->frame_refs[ref_index].idx = -1;
 }
 
 int vp9_receive_compressed_data(VP9Decoder *pbi,
                                 size_t size, const uint8_t **psource) {
-  VP9_COMMON *const cm = &pbi->common;
+  VP9_COMMON *volatile const cm = &pbi->common;
+  BufferPool *volatile const pool = cm->buffer_pool;
+  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
   const uint8_t *source = *psource;
   int retcode = 0;
-
   cm->error.error_code = VPX_CODEC_OK;
 
   if (size == 0) {
@@ -223,57 +287,124 @@
     // TODO(jkoleszar): Error concealment is undefined and non-normative
     // at this point, but if it becomes so, [0] may not always be the correct
     // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX)
+    if (cm->frame_refs[0].idx > 0) {
+      assert(cm->frame_refs[0].buf != NULL);
       cm->frame_refs[0].buf->corrupted = 1;
+    }
   }
 
+  pbi->ready_for_new_data = 0;
+
   // Check if the previous frame was a frame without any references to it.
-  if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
-    cm->release_fb_cb(cm->cb_priv,
-                      &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Release frame buffer if not decoding in frame parallel mode.
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && frame_bufs[cm->new_fb_idx].ref_count == 0)
+    pool->release_fb_cb(pool->cb_priv,
+                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Find a free frame buffer. Return an error if none is available.
   cm->new_fb_idx = get_free_fb(cm);
+  if (cm->new_fb_idx == INVALID_IDX)
+    return VPX_CODEC_MEM_ERROR;
+
+  // Assign a MV array to the frame buffer.
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+  pbi->hold_ref_buf = 0;
+  if (pbi->frame_parallel_decode) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    vp9_frameworker_lock_stats(worker);
+    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+    // Reset decoding progress.
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+  }
+
 
   if (setjmp(cm->error.jmp)) {
+    const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+    int i;
+
     cm->error.setjmp = 0;
-    vp9_clear_system_state();
+    pbi->ready_for_new_data = 1;
 
-    // We do not know if the missing frame(s) was supposed to update
-    // any of the reference buffers, but we act conservative and
-    // mark only the last buffer as corrupted.
-    //
-    // TODO(jkoleszar): Error concealment is undefined and non-normative
-    // at this point, but if it becomes so, [0] may not always be the correct
-    // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
-      cm->frame_refs[0].buf->corrupted = 1;
+    // Synchronize all threads immediately as a subsequent decode call may
+    // cause a resize invalidating some allocations.
+    winterface->sync(&pbi->lf_worker);
+    for (i = 0; i < pbi->num_tile_workers; ++i) {
+      winterface->sync(&pbi->tile_workers[i]);
+    }
 
-    if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
-      cm->frame_bufs[cm->new_fb_idx].ref_count--;
+    lock_buffer_pool(pool);
+    // Release all the reference buffers if worker thread is holding them.
+    if (pbi->hold_ref_buf == 1) {
+      int ref_index = 0, mask;
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // The current thread releases its hold on the reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);
 
+        // Release the reference frame in reference map.
+        if ((mask & 1) && old_idx >= 0) {
+          decrease_ref_count(old_idx, frame_bufs, pool);
+        }
+        ++ref_index;
+      }
+
+      // The current thread releases its hold on the reference frame.
+      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      pbi->hold_ref_buf = 0;
+    }
+    // Release current frame.
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+
+    vpx_clear_system_state();
     return -1;
   }
 
   cm->error.setjmp = 1;
-
   vp9_decode_frame(pbi, source, source + size, psource);
 
   swap_frame_buffers(pbi);
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-
-  if (!cm->show_existing_frame)
+  if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    if (!cm->show_existing_frame)
-      vp9_swap_mi_and_prev_mi(cm);
-
-    cm->current_video_frame++;
+    cm->prev_frame = cm->cur_frame;
+    if (cm->seg.enabled && !pbi->frame_parallel_decode)
+      vp9_swap_current_and_last_seg_map(cm);
   }
 
-  pbi->ready_for_new_data = 0;
+  // Update decoding progress in frame-parallel mode.
+  if (pbi->frame_parallel_decode) {
+    // Need to lock the mutex here as another thread may
+    // be accessing this buffer.
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    vp9_frameworker_lock_stats(worker);
+
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+    frame_worker_data->frame_decoded = 1;
+    frame_worker_data->frame_context_ready = 1;
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+  }
 
   cm->error.setjmp = 0;
   return retcode;
@@ -290,6 +421,8 @@
   if (pbi->ready_for_new_data == 1)
     return ret;
 
+  pbi->ready_for_new_data = 1;
+
   /* no raw frame to show!!! */
   if (!cm->show_frame)
     return ret;
@@ -307,6 +440,70 @@
   *sd = *cm->frame_to_show;
   ret = 0;
 #endif /*!CONFIG_POSTPROC*/
-  vp9_clear_system_state();
+  vpx_clear_system_state();
   return ret;
 }
+
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a superframe index. If the last byte of real video compression
+  // data is 0xc0, the encoder must add a 0 byte. If we have the marker but
+  // not the matching marker byte at the front of the index, we have an
+  // invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  assert(data_sz);
+  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+  *count = 0;
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      // Frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
+      }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
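
As an aside, the marker arithmetic above can be restated as a small, self-contained program: the frame count sits in the low 3 bits plus one, the size-field magnitude in the next 2 bits plus one, and the index occupies 2 + mag * frames bytes, beginning and ending with the marker. The sketch below is hypothetical and is not part of this patch or of libvpx.

  /* Hypothetical, stand-alone illustration of the superframe marker layout
   * parsed by vp9_parse_superframe_index(); not libvpx code. */
  #include <stdint.h>
  #include <stdio.h>

  static void describe_superframe_marker(uint8_t marker) {
    if ((marker & 0xe0) == 0xc0) {
      const uint32_t frames = (marker & 0x7) + 1;      /* low 3 bits, plus one */
      const uint32_t mag = ((marker >> 3) & 0x3) + 1;  /* bytes per frame size */
      const uint32_t index_sz = 2 + mag * frames;      /* marker appears twice */
      printf("superframe: %u frames, %u-byte sizes, %u-byte index\n",
             frames, mag, index_sz);
    } else {
      printf("0x%02x is not a superframe marker\n", marker);
    }
  }

  int main(void) {
    describe_superframe_marker(0xc9);  /* 2 frames, 2-byte sizes -> 6-byte index */
    return 0;
  }
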
diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h
index 223b66f..915f9dc 100644
--- a/libvpx/vp9/decoder/vp9_decoder.h
+++ b/libvpx/vp9/decoder/vp9_decoder.h
@@ -14,12 +14,13 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_thread.h"
 
+#include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
-#include "vp9/common/vp9_thread.h"
-
 #include "vp9/decoder/vp9_dthread.h"
 
 #ifdef __cplusplus
@@ -29,10 +30,22 @@
 // TODO(hkuang): combine this with TileWorkerData.
 typedef struct TileData {
   VP9_COMMON *cm;
-  vp9_reader bit_reader;
+  vpx_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff is shared by all the planes, so planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
 } TileData;
 
+typedef struct TileWorkerData {
+  struct VP9Decoder *pbi;
+  vpx_reader bit_reader;
+  FRAME_COUNTS counts;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
@@ -44,8 +57,15 @@
 
   int frame_parallel_decode;  // frame-based threading.
 
-  VP9Worker lf_worker;
-  VP9Worker *tile_workers;
+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;   //  Current decoding frame buffer.
+
+  VPxWorker *frame_worker_owner;   // frame_worker that owns this pbi.
+  VPxWorker lf_worker;
+  VPxWorker *tile_workers;
+  TileWorkerData *tile_worker_data;
+  TileInfo *tile_worker_info;
   int num_tile_workers;
 
   TileData *tile_data;
@@ -58,6 +78,8 @@
 
   int max_threads;
   int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame.
+  int hold_ref_buf;  // hold the reference buffer.
 } VP9Decoder;
 
 int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -74,10 +96,44 @@
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd);
 
-struct VP9Decoder *vp9_decoder_create();
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+                                  void *decrypt_state,
+                                  const uint8_t *data) {
+  if (decrypt_cb) {
+    uint8_t marker;
+    decrypt_cb(decrypt_state, data, &marker, 1);
+    return marker;
+  }
+  return *data;
+}
+
+// This function is exposed for use in tests, as is the inlined function
+// read_marker() above.
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state);
+
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
 
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  if (idx >= 0) {
+    --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling get_free_fb.
+    // However, the private buffer is not set up until the frame header has
+    // been decoded, so if an error occurs while decoding the header the
+    // frame_bufs entry will not have a valid priv buffer.
+    if (frame_bufs[idx].ref_count == 0 &&
+        frame_bufs[idx].raw_frame_buffer.priv) {
+      pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+    }
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
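
The release-on-zero behaviour of decrease_ref_count() above can be summarised with a hedged, stand-alone sketch. The mock types and mock_release_cb below are hypothetical; they only mirror the check that the private buffer exists before the release callback fires.

  /* Hypothetical sketch of the release-on-zero pattern used by
   * decrease_ref_count(); the mock types are not libvpx types. */
  #include <stdio.h>

  struct mock_frame_buffer { void *priv; };
  struct mock_buf { int ref_count; struct mock_frame_buffer raw; };

  static void mock_release_cb(void *cb_priv, struct mock_frame_buffer *fb) {
    (void)cb_priv;
    printf("releasing buffer with priv=%p\n", fb->priv);
    fb->priv = NULL;
  }

  static void mock_decrease_ref_count(struct mock_buf *buf, void *cb_priv) {
    --buf->ref_count;
    /* Release only when the last reference is gone *and* the private buffer
     * was set up, i.e. the frame header decoded successfully. */
    if (buf->ref_count == 0 && buf->raw.priv != NULL)
      mock_release_cb(cb_priv, &buf->raw);
  }

  int main(void) {
    int storage = 42;
    struct mock_buf buf = { 2, { &storage } };
    mock_decrease_ref_count(&buf, NULL);  /* 2 -> 1: nothing released */
    mock_decrease_ref_count(&buf, NULL);  /* 1 -> 0: buffer released  */
    return 0;
  }
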
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index 91cdf38..e4412dc 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -13,6 +13,10 @@
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
 
 #include "vp9/decoder/vp9_detokenize.h"
 
@@ -30,60 +34,95 @@
 
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
-     if (!cm->frame_parallel_decoding_mode)                 \
+     if (counts)                                            \
        ++coef_counts[band][ctx][token];                     \
   } while (0)
 
-#define WRITE_COEF_CONTINUE(val, token)                  \
-  {                                                      \
-    v = (val * dqv) >> dq_shift;                         \
-    dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;         \
-    token_cache[scan[c]] = vp9_pt_energy_class[token];   \
-    ++c;                                                 \
-    ctx = get_coef_context(nb, token_cache, c);          \
-    dqv = dq[1];                                         \
-    continue;                                            \
-  }
+static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+  int i, val = 0;
+  for (i = 0; i < n; ++i)
+    val = (val << 1) | vpx_read(r, probs[i]);
+  return val;
+}
 
-#define ADJUST_COEF(prob, bits_count)                   \
-  do {                                                  \
-    val += (vp9_read(r, prob) << bits_count);           \
-  } while (0)
-
-static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
-                       int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
-                       int ctx, const int16_t *scan, const int16_t *nb,
-                       vp9_reader *r) {
+static int decode_coefs(const MACROBLOCKD *xd,
+                        PLANE_TYPE type,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        vpx_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
-  const FRAME_CONTEXT *const fc = &cm->fc;
-  FRAME_COUNTS *const counts = &cm->counts;
+  const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   int band, c = 0;
-  const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
-  const vp9_prob *prob;
-  unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] =
-      counts->coef[tx_size][type][ref];
-  unsigned int (*eob_branch_count)[COEFF_CONTEXTS] =
-      counts->eob_branch[tx_size][type][ref];
+  const vpx_prob *prob;
+  unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
+  unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
   uint8_t token_cache[32 * 32];
   const uint8_t *band_translate = get_band_translate(tx_size);
   const int dq_shift = (tx_size == TX_32X32);
-  int v;
+  int v, token;
   int16_t dqv = dq[0];
+  const uint8_t *cat1_prob;
+  const uint8_t *cat2_prob;
+  const uint8_t *cat3_prob;
+  const uint8_t *cat4_prob;
+  const uint8_t *cat5_prob;
+  const uint8_t *cat6_prob;
+
+  if (counts) {
+    coef_counts = counts->coef[tx_size][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd > VPX_BITS_8) {
+    if (xd->bd == VPX_BITS_10) {
+      cat1_prob = vp9_cat1_prob_high10;
+      cat2_prob = vp9_cat2_prob_high10;
+      cat3_prob = vp9_cat3_prob_high10;
+      cat4_prob = vp9_cat4_prob_high10;
+      cat5_prob = vp9_cat5_prob_high10;
+      cat6_prob = vp9_cat6_prob_high10;
+    } else {
+      cat1_prob = vp9_cat1_prob_high12;
+      cat2_prob = vp9_cat2_prob_high12;
+      cat3_prob = vp9_cat3_prob_high12;
+      cat4_prob = vp9_cat4_prob_high12;
+      cat5_prob = vp9_cat5_prob_high12;
+      cat6_prob = vp9_cat6_prob_high12;
+    }
+  } else {
+    cat1_prob = vp9_cat1_prob;
+    cat2_prob = vp9_cat2_prob;
+    cat3_prob = vp9_cat3_prob;
+    cat4_prob = vp9_cat4_prob;
+    cat5_prob = vp9_cat5_prob;
+    cat6_prob = vp9_cat6_prob;
+  }
+#else
+  cat1_prob = vp9_cat1_prob;
+  cat2_prob = vp9_cat2_prob;
+  cat3_prob = vp9_cat3_prob;
+  cat4_prob = vp9_cat4_prob;
+  cat5_prob = vp9_cat5_prob;
+  cat6_prob = vp9_cat6_prob;
+#endif
 
   while (c < max_eob) {
-    int val;
+    int val = -1;
     band = *band_translate++;
     prob = coef_probs[band][ctx];
-    if (!cm->frame_parallel_decoding_mode)
+    if (counts)
       ++eob_branch_count[band][ctx];
-    if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -95,97 +134,135 @@
       prob = coef_probs[band][ctx];
     }
 
-    // ONE_CONTEXT_NODE_0_
-    if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
-      WRITE_COEF_CONTINUE(1, ONE_TOKEN);
-    }
-
-    INCREMENT_COUNT(TWO_TOKEN);
-
-    prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
-
-    if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(2, TWO_TOKEN);
+      token = ONE_TOKEN;
+      val = 1;
+    } else {
+      INCREMENT_COUNT(TWO_TOKEN);
+      token = vpx_read_tree(r, vp9_coef_con_tree,
+                            vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
+      switch (token) {
+        case TWO_TOKEN:
+        case THREE_TOKEN:
+        case FOUR_TOKEN:
+          val = token;
+          break;
+        case CATEGORY1_TOKEN:
+          val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, r);
+          break;
+        case CATEGORY2_TOKEN:
+          val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, r);
+          break;
+        case CATEGORY3_TOKEN:
+          val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, r);
+          break;
+        case CATEGORY4_TOKEN:
+          val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, r);
+          break;
+        case CATEGORY5_TOKEN:
+          val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
+          break;
+        case CATEGORY6_TOKEN:
+#if CONFIG_VP9_HIGHBITDEPTH
+          switch (xd->bd) {
+            case VPX_BITS_8:
+              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+              break;
+            case VPX_BITS_10:
+              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r);
+              break;
+            case VPX_BITS_12:
+              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r);
+              break;
+            default:
+              assert(0);
+              return -1;
+          }
+#else
+          val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+#endif
+          break;
       }
-      if (!vp9_read(r, prob[THREE_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(3, THREE_TOKEN);
-      }
-      WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
-
-    if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
-        val = CAT1_MIN_VAL;
-        ADJUST_COEF(vp9_cat1_prob[0], 0);
-        WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN);
-      }
-      val = CAT2_MIN_VAL;
-      ADJUST_COEF(vp9_cat2_prob[0], 1);
-      ADJUST_COEF(vp9_cat2_prob[1], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN);
-    }
-
-    if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
-        val = CAT3_MIN_VAL;
-        ADJUST_COEF(vp9_cat3_prob[0], 2);
-        ADJUST_COEF(vp9_cat3_prob[1], 1);
-        ADJUST_COEF(vp9_cat3_prob[2], 0);
-        WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN);
-      }
-      val = CAT4_MIN_VAL;
-      ADJUST_COEF(vp9_cat4_prob[0], 3);
-      ADJUST_COEF(vp9_cat4_prob[1], 2);
-      ADJUST_COEF(vp9_cat4_prob[2], 1);
-      ADJUST_COEF(vp9_cat4_prob[3], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN);
-    }
-
-    if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
-      val = CAT5_MIN_VAL;
-      ADJUST_COEF(vp9_cat5_prob[0], 4);
-      ADJUST_COEF(vp9_cat5_prob[1], 3);
-      ADJUST_COEF(vp9_cat5_prob[2], 2);
-      ADJUST_COEF(vp9_cat5_prob[3], 1);
-      ADJUST_COEF(vp9_cat5_prob[4], 0);
-      WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN);
-    }
-    val = 0;
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[0]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[1]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[2]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[3]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[4]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[5]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[6]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[7]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[8]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[9]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[10]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[11]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[12]);
-    val = (val << 1) | vp9_read(r, vp9_cat6_prob[13]);
-    val += CAT6_MIN_VAL;
-
-    WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN);
+    v = (val * dqv) >> dq_shift;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
+                                          xd->bd);
+#else
+    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    ctx = get_coef_context(nb, token_cache, c);
+    dqv = dq[1];
   }
 
   return c;
 }
 
-int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
-                            int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r) {
+// TODO(slavarnway): Decode version of vp9_set_context.  Modify vp9_set_context
+// after testing is complete, then delete this version.
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_context + loff;
+  const int tx_size_in_blocks = 1 << tx_size;
+
+  // above
+  if (has_eob && xd->mb_to_right_edge < 0) {
+    int i;
+    const int blocks_wide = pd->n4_w +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int above_contexts = tx_size_in_blocks;
+    if (above_contexts + aoff > blocks_wide)
+      above_contexts = blocks_wide - aoff;
+
+    for (i = 0; i < above_contexts; ++i)
+      a[i] = has_eob;
+    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+      a[i] = 0;
+  } else {
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+
+  // left
+  if (has_eob && xd->mb_to_bottom_edge < 0) {
+    int i;
+    const int blocks_high = pd->n4_h +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int left_contexts = tx_size_in_blocks;
+    if (left_contexts + loff > blocks_high)
+      left_contexts = blocks_high - loff;
+
+    for (i = 0; i < left_contexts; ++i)
+      l[i] = has_eob;
+    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+      l[i] = 0;
+  } else {
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+}
+
+int vp9_decode_block_tokens(MACROBLOCKD *xd,
+                            int plane, const scan_order *sc,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int16_t *const dequant = pd->seg_dequant[seg_id];
   const int ctx = get_entropy_context(tx_size, pd->above_context + x,
                                                pd->left_context + y);
-  const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
-  const int eob = decode_coefs(cm, xd, pd->plane_type,
-                               BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
-                               pd->dequant, ctx, so->scan, so->neighbors, r);
-  vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
+  const int eob = decode_coefs(xd, pd->plane_type,
+                               pd->dqcoeff, tx_size,
+                               dequant, ctx, sc->scan, sc->neighbors, r);
+  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
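
For readers new to the token decode path: read_coeff() above assembles the extra bits MSB-first, one arithmetic-coded bit per probability, and the category tokens add that value to a per-category minimum such as CAT1_MIN_VAL. The plain-C sketch below shows only the same bit-assembly shape, with the arithmetic reader replaced by a bit array; it is an illustration, not libvpx code.

  /* Hypothetical sketch of the MSB-first literal assembly performed by
   * read_coeff(), with the arithmetic bit reader replaced by a plain array. */
  #include <stdio.h>

  static int read_bits_msb_first(const int *bits, int n) {
    int i, val = 0;
    for (i = 0; i < n; ++i)
      val = (val << 1) | bits[i];  /* same shape as read_coeff()'s loop */
    return val;
  }

  int main(void) {
    const int extra_bits[4] = { 1, 0, 1, 1 };            /* MSB first        */
    printf("%d\n", read_bits_msb_first(extra_bits, 4));  /* prints 11 (1011) */
    return 0;
  }
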
 
diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h
index 5278e97..d242d44 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/libvpx/vp9/decoder/vp9_detokenize.h
@@ -12,16 +12,19 @@
 #ifndef VP9_DECODER_VP9_DETOKENIZE_H_
 #define VP9_DECODER_VP9_DETOKENIZE_H_
 
+#include "vpx_dsp/bitreader.h"
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
+#include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
-                            int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r);
+int vp9_decode_block_tokens(MACROBLOCKD *xd,
+                            int plane, const scan_order *sc,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c
index c22617e..4fbc6db 100644
--- a/libvpx/vp9/decoder/vp9_dsubexp.c
+++ b/libvpx/vp9/decoder/vp9_dsubexp.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "vp9/common/vp9_entropy.h"
 
 #include "vp9/decoder/vp9_dsubexp.h"
@@ -16,61 +18,59 @@
   if (v > 2 * m)
     return v;
 
-  return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+  return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(vp9_reader *r) {
+static int decode_uniform(vpx_reader *r) {
   const int l = 8;
   const int m = (1 << l) - 191;
-  const int v = vp9_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
+  const int v = vpx_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
-  static int inv_map_table[MAX_PROB - 1] = {
-      6,  19,  32,  45,  58,  71,  84,  97, 110, 123, 136, 149, 162, 175, 188,
-    201, 214, 227, 240, 253,   0,   1,   2,   3,   4,   5,   7,   8,   9,  10,
-     11,  12,  13,  14,  15,  16,  17,  18,  20,  21,  22,  23,  24,  25,  26,
-     27,  28,  29,  30,  31,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
-     43,  44,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  59,
-     60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  72,  73,  74,  75,
-     76,  77,  78,  79,  80,  81,  82,  83,  85,  86,  87,  88,  89,  90,  91,
-     92,  93,  94,  95,  96,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
-    108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124,
-    125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140,
-    141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156,
-    157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
-    173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189,
-    190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
-    206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
-    222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
-    238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252
+  static int inv_map_table[MAX_PROB] = {
+      7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
+    202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
+     12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
+     28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
+     44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
+     61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
+     77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
+     93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+    109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
+    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
+    142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
+    158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+    174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
+    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+    207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
+    223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253
   };
-  // The clamp is not necessary for conforming VP9 stream, it is added to
-  // prevent out of bound access for bad input data
-  v = clamp(v, 0, 253);
+  assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
   v = inv_map_table[v];
   m--;
   if ((m << 1) <= MAX_PROB) {
-    return 1 + inv_recenter_nonneg(v + 1, m);
+    return 1 + inv_recenter_nonneg(v, m);
   } else {
-    return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m);
+    return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
   }
 }
 
-static int decode_term_subexp(vp9_reader *r) {
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4);
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4) + 16;
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 5) + 32;
+static int decode_term_subexp(vpx_reader *r) {
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4);
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4) + 16;
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 5) + 32;
   return decode_uniform(r) + 64;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
-  if (vp9_read(r, DIFF_UPDATE_PROB)) {
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p) {
+  if (vpx_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r);
-    *p = (vp9_prob)inv_remap_prob(delp, *p);
+    *p = (vpx_prob)inv_remap_prob(delp, *p);
   }
 }
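
The recentering inverse above fans small values out alternately below and above the current probability m and passes large values straight through. The stand-alone sketch below reproduces that arithmetic with a few worked values; it is included only for illustration and is not part of the patch.

  /* Stand-alone copy of the recentering inverse used by inv_remap_prob(),
   * shown here only to make the mapping concrete. */
  #include <stdio.h>

  static int inv_recenter_nonneg(int v, int m) {
    if (v > 2 * m)
      return v;                                 /* beyond recentering range */
    return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
  }

  int main(void) {
    printf("%d\n", inv_recenter_nonneg(0, 10));   /* 10: the center itself     */
    printf("%d\n", inv_recenter_nonneg(1, 10));   /*  9: one step below        */
    printf("%d\n", inv_recenter_nonneg(2, 10));   /* 11: one step above        */
    printf("%d\n", inv_recenter_nonneg(25, 10));  /* 25: beyond 2*m, unchanged */
    return 0;
  }
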
diff --git a/libvpx/vp9/decoder/vp9_dsubexp.h b/libvpx/vp9/decoder/vp9_dsubexp.h
index 436f434..a8bcc70 100644
--- a/libvpx/vp9/decoder/vp9_dsubexp.h
+++ b/libvpx/vp9/decoder/vp9_dsubexp.h
@@ -12,13 +12,13 @@
 #ifndef VP9_DECODER_VP9_DSUBEXP_H_
 #define VP9_DECODER_VP9_DSUBEXP_H_
 
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_dsp/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/decoder/vp9_dthread.c b/libvpx/vp9/decoder/vp9_dthread.c
index 5dda49a..14a7144 100644
--- a/libvpx/vp9/decoder/vp9_dthread.c
+++ b/libvpx/vp9/decoder/vp9_dthread.c
@@ -9,273 +9,181 @@
  */
 
 #include "./vpx_config.h"
-
 #include "vpx_mem/vpx_mem.h"
-
 #include "vp9/common/vp9_reconinter.h"
-
 #include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_decoder.h"
 
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void vp9_frameworker_lock_stats(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
-  const int kMaxTryLocks = 4000;
-  int locked = 0;
-  int i;
-
-  for (i = 0; i < kMaxTryLocks; ++i) {
-    if (!pthread_mutex_trylock(mutex)) {
-      locked = 1;
-      break;
-    }
-  }
-
-  if (!locked)
-    pthread_mutex_lock(mutex);
+  FrameWorkerData *const worker_data = worker->data1;
+  pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+  (void)worker;
+#endif
 }
-#endif  // CONFIG_MULTITHREAD
 
-static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+void vp9_frameworker_unlock_stats(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  const int nsync = lf_sync->sync_range;
+  FrameWorkerData *const worker_data = worker->data1;
+  pthread_mutex_unlock(&worker_data->stats_mutex);
+#else
+  (void)worker;
+#endif
+}
 
-  if (r && !(c & (nsync - 1))) {
-    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
-    mutex_lock(mutex);
+void vp9_frameworker_signal_stats(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const worker_data = worker->data1;
 
-    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
-      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+  pthread_cond_signal(&worker_data->stats_cond);
+#else
+  pthread_cond_broadcast(&worker_data->stats_cond);
+#endif
+
+#else
+  (void)worker;
+#endif
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row) {
+#if CONFIG_MULTITHREAD
+  if (!ref_buf)
+    return;
+
+#ifndef BUILDING_WITH_TSAN
+  // The following check triggers a harmless tsan warning, but it is the key
+  // to getting the best performance.
+  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
+
+  {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have an owner.
+    VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
+    FrameWorkerData *const ref_worker_data =
+        (FrameWorkerData *)ref_worker->data1;
+    const VP9Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+    {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
+             worker_data->worker_id, worker, ref_worker_data->worker_id,
+             ref_buf->frame_worker_owner, row, ref_buf->row);
     }
-    pthread_mutex_unlock(mutex);
+#endif
+
+    vp9_frameworker_lock_stats(ref_worker);
+    while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+           ref_buf->buf.corrupted != 1) {
+      pthread_cond_wait(&ref_worker_data->stats_cond,
+                        &ref_worker_data->stats_mutex);
+    }
+
+    if (ref_buf->buf.corrupted == 1) {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vp9_frameworker_unlock_stats(ref_worker);
+      vpx_internal_error(&worker_data->pbi->common.error,
+                         VPX_CODEC_CORRUPT_FRAME,
+                         "Worker %p failed to decode frame", worker);
+    }
+    vp9_frameworker_unlock_stats(ref_worker);
   }
 #else
-  (void)lf_sync;
-  (void)r;
-  (void)c;
+  (void)worker;
+  (void)ref_buf;
+  (void)row;
 #endif  // CONFIG_MULTITHREAD
 }
 
-static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
-                              const int sb_cols) {
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
 #if CONFIG_MULTITHREAD
-  const int nsync = lf_sync->sync_range;
-  int cur;
-  // Only signal when there are enough filtered SB for next row to run.
-  int sig = 1;
+  VPxWorker *worker = buf->frame_worker_owner;
 
-  if (c < sb_cols - 1) {
-    cur = c;
-    if (c % nsync)
-      sig = 0;
-  } else {
-    cur = sb_cols + nsync;
+#ifdef DEBUG_THREAD
+  {
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+           buf->frame_worker_owner, row);
   }
+#endif
 
-  if (sig) {
-    mutex_lock(&lf_sync->mutex_[r]);
-
-    lf_sync->cur_sb_col[r] = cur;
-
-    pthread_cond_signal(&lf_sync->cond_[r]);
-    pthread_mutex_unlock(&lf_sync->mutex_[r]);
-  }
+  vp9_frameworker_lock_stats(worker);
+  buf->row = row;
+  vp9_frameworker_signal_stats(worker);
+  vp9_frameworker_unlock_stats(worker);
 #else
-  (void)lf_sync;
-  (void)r;
-  (void)c;
-  (void)sb_cols;
+  (void)buf;
+  (void)row;
 #endif  // CONFIG_MULTITHREAD
 }
 
-// Implement row loopfiltering for each thread.
-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
-                                VP9_COMMON *const cm,
-                                struct macroblockd_plane planes[MAX_MB_PLANE],
-                                int start, int stop, int y_only,
-                                VP9LfSync *const lf_sync, int num_lf_workers) {
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  int r, c;  // SB row and col
-  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
-
-  for (r = start; r < stop; r += num_lf_workers) {
-    const int mi_row = r << MI_BLOCK_SIZE_LOG2;
-    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
-    for (c = 0; c < sb_cols; ++c) {
-      const int mi_col = c << MI_BLOCK_SIZE_LOG2;
-      LOOP_FILTER_MASK lfm;
-      int plane;
-
-      sync_read(lf_sync, r, c);
-
-      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
-
-      for (plane = 0; plane < num_planes; ++plane) {
-        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
-      }
-
-      sync_write(lf_sync, r, c, sb_cols);
-    }
-  }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
-  LFWorkerData *const lf_data = &tile_data->lfdata;
-  (void) arg2;
-  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                      lf_data->start, lf_data->stop, lf_data->y_only,
-                      lf_data->lf_sync, lf_data->num_lf_workers);
-  return 1;
-}
-
-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
-// threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              VP9Decoder *pbi, VP9_COMMON *cm,
-                              int frame_filter_level,
-                              int y_only) {
-  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
-  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
-  // Number of superblock rows and cols
-  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+  FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+  VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+  VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
   int i;
 
-  // Allocate memory used in thread synchronization.
-  // This always needs to be done even if frame_filter_level is 0.
-  if (!cm->current_video_frame || cm->last_height != cm->height) {
-    if (cm->last_height != cm->height) {
-      const int aligned_last_height =
-          ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
-      const int last_sb_rows =
-          mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
-          MI_BLOCK_SIZE_LOG2;
-
-      vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
-    }
-
-    vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
+  // Wait until source frame's context is ready.
+  vp9_frameworker_lock_stats(src_worker);
+  while (!src_worker_data->frame_context_ready) {
+    pthread_cond_wait(&src_worker_data->stats_cond,
+        &src_worker_data->stats_mutex);
   }
 
-  if (!frame_filter_level) return;
+  dst_cm->last_frame_seg_map = src_cm->seg.enabled ?
+      src_cm->current_frame_seg_map : src_cm->last_frame_seg_map;
+  dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+  vp9_frameworker_unlock_stats(src_worker);
 
-  vp9_loop_filter_frame_init(cm, frame_filter_level);
+  dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+  dst_cm->prev_frame = src_cm->show_existing_frame ?
+                       src_cm->prev_frame : src_cm->cur_frame;
+  dst_cm->last_width = !src_cm->show_existing_frame ?
+                       src_cm->width : src_cm->last_width;
+  dst_cm->last_height = !src_cm->show_existing_frame ?
+                        src_cm->height : src_cm->last_height;
+  dst_cm->subsampling_x = src_cm->subsampling_x;
+  dst_cm->subsampling_y = src_cm->subsampling_y;
+  dst_cm->frame_type = src_cm->frame_type;
+  dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+                            src_cm->show_frame : src_cm->last_show_frame;
+  for (i = 0; i < REF_FRAMES; ++i)
+    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
 
-  // Initialize cur_sb_col to -1 for all SB rows.
-  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
-
-  // Set up loopfilter thread data.
-  // The decoder is using num_workers instead of pbi->num_tile_workers
-  // because it has been observed that using more threads on the
-  // loopfilter, than there are tile columns in the frame will hurt
-  // performance on Android. This is because the system will only
-  // schedule the tile decode workers on cores equal to the number
-  // of tile columns. Then if the decoder tries to use more threads for the
-  // loopfilter, it will hurt performance because of contention. If the
-  // multithreading code changes in the future then the number of workers
-  // used by the loopfilter should be revisited.
-  for (i = 0; i < num_workers; ++i) {
-    VP9Worker *const worker = &pbi->tile_workers[i];
-    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
-    LFWorkerData *const lf_data = &tile_data->lfdata;
-
-    worker->hook = (VP9WorkerHook)loop_filter_row_worker;
-
-    // Loopfilter data
-    lf_data->frame_buffer = frame;
-    lf_data->cm = cm;
-    vp9_copy(lf_data->planes, pbi->mb.plane);
-    lf_data->start = i;
-    lf_data->stop = sb_rows;
-    lf_data->y_only = y_only;   // always do all planes in decoder
-
-    lf_data->lf_sync = lf_sync;
-    lf_data->num_lf_workers = num_workers;
-
-    // Start loopfiltering
-    if (i == num_workers - 1) {
-      winterface->execute(worker);
-    } else {
-      winterface->launch(worker);
-    }
-  }
-
-  // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
-    winterface->sync(&pbi->tile_workers[i]);
-  }
-}
-
-// Set up nsync by width.
-static int get_sync_range(int width) {
-  // nsync numbers are picked by testing. For example, for 4k
-  // video, using 4 gives best performance.
-  if (width < 640)
-    return 1;
-  else if (width <= 1280)
-    return 2;
-  else if (width <= 4096)
-    return 4;
-  else
-    return 8;
-}
-
-// Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
-                           int width) {
-#if CONFIG_MULTITHREAD
-  int i;
-
-  CHECK_MEM_ERROR(cm, lf_sync->mutex_,
-                  vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
-  for (i = 0; i < rows; ++i) {
-    pthread_mutex_init(&lf_sync->mutex_[i], NULL);
-  }
-
-  CHECK_MEM_ERROR(cm, lf_sync->cond_,
-                  vpx_malloc(sizeof(*lf_sync->cond_) * rows));
-  for (i = 0; i < rows; ++i) {
-    pthread_cond_init(&lf_sync->cond_[i], NULL);
-  }
+  memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+         (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+  dst_cm->lf.filter_level = src_cm->lf.filter_level;
+  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
+  memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+  dst_cm->seg = src_cm->seg;
+  memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+         FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+  (void) dst_worker;
+  (void) src_worker;
 #endif  // CONFIG_MULTITHREAD
-
-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
-                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
-
-  // Set up nsync.
-  lf_sync->sync_range = get_sync_range(width);
-}
-
-// Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
-#if !CONFIG_MULTITHREAD
-  (void)rows;
-#endif  // !CONFIG_MULTITHREAD
-
-  if (lf_sync != NULL) {
-#if CONFIG_MULTITHREAD
-    int i;
-
-    if (lf_sync->mutex_ != NULL) {
-      for (i = 0; i < rows; ++i) {
-        pthread_mutex_destroy(&lf_sync->mutex_[i]);
-      }
-      vpx_free(lf_sync->mutex_);
-    }
-    if (lf_sync->cond_ != NULL) {
-      for (i = 0; i < rows; ++i) {
-        pthread_cond_destroy(&lf_sync->cond_[i]);
-      }
-      vpx_free(lf_sync->cond_);
-    }
-#endif  // CONFIG_MULTITHREAD
-    vpx_free(lf_sync->cur_sb_col);
-    // clear the structure as the source of this call may be a resize in which
-    // case this call will be followed by an _alloc() which may fail.
-    vp9_zero(*lf_sync);
-  }
 }
diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h
index 423bd88..f6cdccd 100644
--- a/libvpx/vp9/decoder/vp9_dthread.h
+++ b/libvpx/vp9/decoder/vp9_dthread.h
@@ -12,46 +12,55 @@
 #define VP9_DECODER_VP9_DTHREAD_H_
 
 #include "./vpx_config.h"
-#include "vp9/common/vp9_thread.h"
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_util/vpx_thread.h"
+#include "vpx/internal/vpx_codec_internal.h"
 
 struct VP9Common;
 struct VP9Decoder;
 
-typedef struct TileWorkerData {
-  struct VP9Common *cm;
-  vp9_reader bit_reader;
-  DECLARE_ALIGNED(16, struct macroblockd, xd);
+// WorkerData for the FrameWorker thread. It contains all the worker state and
+// decode structures needed to decode a frame.
+typedef struct FrameWorkerData {
+  struct VP9Decoder *pbi;
+  const uint8_t *data;
+  const uint8_t *data_end;
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;
+  int received_frame;
 
-  // Row-based parallel loopfilter data
-  LFWorkerData lfdata;
-} TileWorkerData;
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
 
-// Loopfilter row synchronization
-typedef struct VP9LfSyncData {
 #if CONFIG_MULTITHREAD
-  pthread_mutex_t *mutex_;
-  pthread_cond_t *cond_;
+  pthread_mutex_t stats_mutex;
+  pthread_cond_t stats_cond;
 #endif
-  // Allocate memory to store the loop-filtered superblock index in each row.
-  int *cur_sb_col;
-  // The optimal sync_range for different resolution and platform should be
-  // determined by testing. Currently, it is chosen to be a power-of-2 number.
-  int sync_range;
-} VP9LfSync;
 
-// Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
-                           int rows, int width);
+  int frame_context_ready;  // Current frame's context is ready to read.
+  int frame_decoded;        // Finished decoding current frame.
+} FrameWorkerData;
 
-// Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows);
+void vp9_frameworker_lock_stats(VPxWorker *const worker);
+void vp9_frameworker_unlock_stats(VPxWorker *const worker);
+void vp9_frameworker_signal_stats(VPxWorker *const worker);
 
-// Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
-                              struct VP9Decoder *pbi,
-                              struct VP9Common *cm,
-                              int frame_filter_level,
-                              int y_only);
+// Wait until ref_buf has been decoded down to row, in real pixel units.
+// Note: the worker may already have finished decoding ref_buf and released it
+// in order to start decoding the next frame, so check whether the worker is
+// still decoding ref_buf.
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker);
 
 #endif  // VP9_DECODER_VP9_DTHREAD_H_
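
The wait/broadcast pair declared above follows the standard condition-variable progress pattern: the decoding worker publishes how far it has got and wakes any waiters, while a consumer sleeps until the reference frame has reached the row it needs. The sketch below shows only that generic pattern with hypothetical names; it is not libvpx code and omits the corruption and ownership checks the real functions perform.

  /* Hypothetical sketch of the progress-signalling pattern behind
   * vp9_frameworker_wait()/vp9_frameworker_broadcast(). */
  #include <pthread.h>
  #include <stdio.h>

  typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    int rows_decoded;
  } row_progress;

  static void wait_for_row(row_progress *p, int row) {
    pthread_mutex_lock(&p->mutex);
    while (p->rows_decoded < row)            /* sleep until enough rows ready */
      pthread_cond_wait(&p->cond, &p->mutex);
    pthread_mutex_unlock(&p->mutex);
  }

  static void broadcast_row(row_progress *p, int row) {
    pthread_mutex_lock(&p->mutex);
    p->rows_decoded = row;                   /* publish progress */
    pthread_cond_broadcast(&p->cond);        /* wake every waiting consumer */
    pthread_mutex_unlock(&p->mutex);
  }

  static void *producer(void *arg) {
    row_progress *p = (row_progress *)arg;
    int row;
    for (row = 16; row <= 64; row += 16)
      broadcast_row(p, row);
    return NULL;
  }

  int main(void) {
    row_progress p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 };
    pthread_t t;
    pthread_create(&t, NULL, producer, &p);
    wait_for_row(&p, 64);                    /* blocks until row 64 is reached */
    pthread_join(&t, NULL);
    printf("reference decoded to row %d\n", p.rows_decoded);
    return 0;
  }
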
diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.c b/libvpx/vp9/decoder/vp9_read_bit_buffer.c
deleted file mode 100644
index 3eef728..0000000
--- a/libvpx/vp9/decoder/vp9_read_bit_buffer.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
-}
-
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
-  const size_t off = rb->bit_offset;
-  const size_t p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
-  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data);
-    return 0;
-  } else {
-    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
-    rb->bit_offset = off + 1;
-    return bit;
-  }
-}
-
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
-  int value = 0, bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    value |= vp9_rb_read_bit(rb) << bit;
-  return value;
-}
-
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
-                               int bits) {
-  const int value = vp9_rb_read_literal(rb, bits);
-  return vp9_rb_read_bit(rb) ? -value : value;
-}
diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
deleted file mode 100644
index fc88bd7..0000000
--- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-
-#include <limits.h>
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*vp9_rb_error_handler)(void *data);
-
-struct vp9_read_bit_buffer {
-  const uint8_t *bit_buffer;
-  const uint8_t *bit_buffer_end;
-  size_t bit_offset;
-
-  void *error_handler_data;
-  vp9_rb_error_handler error_handler;
-};
-
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
diff --git a/libvpx/vp9/decoder/vp9_reader.c b/libvpx/vp9/decoder/vp9_reader.c
deleted file mode 100644
index 6bb4f9f..0000000
--- a/libvpx/vp9/decoder/vp9_reader.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp9/decoder/vp9_reader.h"
-
-// This is meant to be a large, positive constant that can still be efficiently
-// loaded as an immediate (on platforms like ARM, for example).
-// Even relatively modest values like 100 would work fine.
-#define LOTS_OF_BITS 0x40000000
-
-int vp9_reader_init(vp9_reader *r,
-                    const uint8_t *buffer,
-                    size_t size,
-                    vpx_decrypt_cb decrypt_cb,
-                    void *decrypt_state) {
-  if (size && !buffer) {
-    return 1;
-  } else {
-    r->buffer_end = buffer + size;
-    r->buffer = buffer;
-    r->value = 0;
-    r->count = -8;
-    r->range = 255;
-    r->decrypt_cb = decrypt_cb;
-    r->decrypt_state = decrypt_state;
-    vp9_reader_fill(r);
-    return vp9_read_bit(r) != 0;  // marker bit
-  }
-}
-
-void vp9_reader_fill(vp9_reader *r) {
-  const uint8_t *const buffer_end = r->buffer_end;
-  const uint8_t *buffer = r->buffer;
-  const uint8_t *buffer_start = buffer;
-  BD_VALUE value = r->value;
-  int count = r->count;
-  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
-  int loop_end = 0;
-  const size_t bytes_left = buffer_end - buffer;
-  const size_t bits_left = bytes_left * CHAR_BIT;
-  const int x = (int)(shift + CHAR_BIT - bits_left);
-
-  if (r->decrypt_cb) {
-    size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
-    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
-    buffer = r->clear_buffer;
-    buffer_start = r->clear_buffer;
-  }
-
-  if (x >= 0) {
-    count += LOTS_OF_BITS;
-    loop_end = x;
-  }
-
-  if (x < 0 || bits_left) {
-    while (shift >= loop_end) {
-      count += CHAR_BIT;
-      value |= (BD_VALUE)*buffer++ << shift;
-      shift -= CHAR_BIT;
-    }
-  }
-
-  // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
-  // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
-  // assign 'buffer' to 'r->buffer'.
-  r->buffer += buffer - buffer_start;
-  r->value = value;
-  r->count = count;
-}
-
-const uint8_t *vp9_reader_find_end(vp9_reader *r) {
-  // Find the end of the coded buffer
-  while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
-    r->count -= CHAR_BIT;
-    r->buffer--;
-  }
-  return r->buffer;
-}
-
-int vp9_reader_has_error(vp9_reader *r) {
-  // Check if we have reached the end of the buffer.
-  //
-  // Variable 'count' stores the number of bits in the 'value' buffer, minus
-  // 8. The top byte is part of the algorithm, and the remainder is buffered
-  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
-  // occupied, 8 for the algorithm and 8 in the buffer.
-  //
-  // When reading a byte from the user's buffer, count is filled with 8 and
-  // one byte is filled into the value buffer. When we reach the end of the
-  // data, count is additionally filled with LOTS_OF_BITS. So when
-  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
-  //
-  // 1 if we have tried to decode bits after the end of stream was encountered.
-  // 0 No error.
-  return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
-}
diff --git a/libvpx/vp9/decoder/vp9_reader.h b/libvpx/vp9/decoder/vp9_reader.h
deleted file mode 100644
index 2d9eccf..0000000
--- a/libvpx/vp9/decoder/vp9_reader.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_READER_H_
-#define VP9_DECODER_VP9_READER_H_
-
-#include <stddef.h>
-#include <limits.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vp8dx.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_prob.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef size_t BD_VALUE;
-
-#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
-
-typedef struct {
-  const uint8_t *buffer_end;
-  const uint8_t *buffer;
-  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
-  BD_VALUE value;
-  int count;
-  unsigned int range;
-  vpx_decrypt_cb decrypt_cb;
-  void *decrypt_state;
-} vp9_reader;
-
-int vp9_reader_init(vp9_reader *r,
-                    const uint8_t *buffer,
-                    size_t size,
-                    vpx_decrypt_cb decrypt_cb,
-                    void *decrypt_state);
-
-void vp9_reader_fill(vp9_reader *r);
-
-int vp9_reader_has_error(vp9_reader *r);
-
-const uint8_t *vp9_reader_find_end(vp9_reader *r);
-
-static INLINE int vp9_read(vp9_reader *r, int prob) {
-  unsigned int bit = 0;
-  BD_VALUE value;
-  BD_VALUE bigsplit;
-  int count;
-  unsigned int range;
-  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
-
-  if (r->count < 0)
-    vp9_reader_fill(r);
-
-  value = r->value;
-  count = r->count;
-
-  bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
-
-  range = split;
-
-  if (value >= bigsplit) {
-    range = r->range - split;
-    value = value - bigsplit;
-    bit = 1;
-  }
-
-  {
-    register unsigned int shift = vp9_norm[range];
-    range <<= shift;
-    value <<= shift;
-    count -= shift;
-  }
-  r->value = value;
-  r->count = count;
-  r->range = range;
-
-  return bit;
-}
-
-static INLINE int vp9_read_bit(vp9_reader *r) {
-  return vp9_read(r, 128);  // vp9_prob_half
-}
-
-static INLINE int vp9_read_literal(vp9_reader *r, int bits) {
-  int literal = 0, bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    literal |= vp9_read_bit(r) << bit;
-
-  return literal;
-}
-
-static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
-                                const vp9_prob *probs) {
-  vp9_tree_index i = 0;
-
-  while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
-    continue;
-
-  return -i;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_DECODER_VP9_READER_H_
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 0000000..d569ec9
--- /dev/null
+++ b/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
+
+void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return horizontal_add_u16x8(vec_sum);
+}
+
+// ref, src = [0, 510] - max diff = 16-bits
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+  int width = 4 << bwl;
+  int32x4_t sse = vdupq_n_s32(0);
+  int16x8_t total = vdupq_n_s16(0);
+
+  assert(width >= 8);
+  assert((width % 8) == 0);
+
+  do {
+    const int16x8_t r = vld1q_s16(ref);
+    const int16x8_t s = vld1q_s16(src);
+    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
+    sse = vmlal_s16(sse, diff_hi, diff_hi);
+    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  {
+    // Note: the pairwise addition of 'total' could be done the same way as
+    // horizontal_add_u16x8(), but using one fewer vpaddl on 'total' and
+    // pairing it with the summation of 'sse' performed better on a Cortex-A15.
+    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
+    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    const int32x2_t t2 = vpadd_s32(t1, t1);
+    const int t = vget_lane_s32(t2, 0);
+    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int s = vget_lane_s32(s1, 0);
+    const int shift_factor = bwl + 2;
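+    // sse - sum^2 / width (width == 1 << shift_factor), i.e. width * variance.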
+    return s - ((t * t) >> shift_factor);
+  }
+}
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
index 6c66f5d..11e8773 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -9,215 +9,28 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/txfm_common.h"
 
-void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
-  int r;
-  int16x8_t sum = vld1q_s16(&input[0]);
-  for (r = 1; r < 8; ++r) {
-    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
-    sum = vaddq_s16(sum, input_00);
-  }
-  {
-    const int32x4_t a = vpaddlq_s16(sum);
-    const int64x2_t b = vpaddlq_s32(a);
-    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                                 vreinterpret_s32_s64(vget_high_s64(b)));
-    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
-    output[1] = 0;
-  }
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr,
+                            int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+                            const int16_t* dequant_ptr, uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  int16_t temp_buffer[64];
+  (void)coeff_ptr;
+
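+  // Fused transform + quantize: forward-transform into a local buffer, then
+  // pass it to the fp quantizer; the caller's coeff buffer is unused here.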
+  vpx_fdct8x8_neon(input, temp_buffer, stride);
+  vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
 }
-
-void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
-  int i;
-  // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
-                                            vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
-                                            vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
-                                            vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
-                                            vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }  // for
-  {
-    // from vp9_dct_sse2.c
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
-    // store results
-    vst1q_s16(&final_output[0 * 8], input_0);
-    vst1q_s16(&final_output[1 * 8], input_1);
-    vst1q_s16(&final_output[2 * 8], input_2);
-    vst1q_s16(&final_output[3 * 8], input_3);
-    vst1q_s16(&final_output[4 * 8], input_4);
-    vst1q_s16(&final_output[5 * 8], input_5);
-    vst1q_s16(&final_output[6 * 8], input_6);
-    vst1q_s16(&final_output[7 * 8], input_7);
-  }
-}
-
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
new file mode 100644
index 0000000..1c75031
--- /dev/null
+++ b/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+
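+// Returns the sum of squared differences between coeff and dqcoeff,
+// accumulated in 64 bits.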
+int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t error = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = vld1q_s16(coeff);
+    const int16x8_t d = vld1q_s16(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2d5ec79..47363c7 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,20 +26,18 @@
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
                           int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          int zbin_oq_value, uint16_t *eob_ptr,
+                          uint16_t *eob_ptr,
                           const int16_t *scan, const int16_t *iscan) {
-  int i;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)scan;
 
   if (!skip_block) {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
-
+    int i;
     const int16x8_t v_zero = vdupq_n_s16(0);
     const int16x8_t v_one = vdupq_n_s16(1);
     int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
@@ -50,13 +48,12 @@
     v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
     v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
     v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-
-    for (i = 0; i < count; i += 8) {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+    // Process the DC coefficient and the first seven AC coefficients.
+    {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
       const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
-      const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
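+      // vabaq_s16(round, coeff, 0) == round + |coeff - 0|, folding the abs
+      // and the add into one instruction.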
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
       const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
                                            vget_low_s16(v_quant));
       const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
@@ -65,20 +62,39 @@
                                             vshrn_n_s32(v_tmp_hi, 16));
       const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
       const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan =
-          vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
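+      // v_nz_mask is all ones where the quantized value is zero, so the
+      // bit-select keeps iscan + 1 only for non-zero coefficients.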
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
       const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
       const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
       const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-
       v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-
-      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
       v_round = vmovq_n_s16(round_ptr[1]);
       v_quant = vmovq_n_s16(quant_ptr[1]);
       v_dequant = vmovq_n_s16(dequant_ptr[1]);
     }
+    // Now process the remaining AC coefficients; every lane of v_round,
+    // v_quant and v_dequant now holds the AC values set just above.
+    for (i = 8; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+    }
     {
       const int16x4_t v_eobmax_3210 =
           vmax_s16(vget_low_s16(v_eobmax_76543210),
@@ -95,8 +111,8 @@
       *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
     }
   } else {
-    vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-    vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+    memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+    memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
     *eob_ptr = 0;
   }
 }
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
deleted file mode 100644
index 816fbda..0000000
--- a/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "./vp9_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kWidth16 = 16 };
-enum { kHeight16 = 16 };
-enum { kHeight16PlusOne = 17 };
-enum { kWidth32 = 32 };
-enum { kHeight32 = 32 };
-enum { kHeight32PlusOne = 33 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,
-                   kHeight8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
-                   kHeight16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      unsigned int output_width,
-                                      const int16_t *vp9_filter) {
-  const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
-                                       uint8_t *output_ptr,
-                                       unsigned int src_pixels_per_line,
-                                       int pixel_step,
-                                       unsigned int output_height,
-                                       unsigned int output_width,
-                                       const int16_t *vp9_filter) {
-  const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
-  unsigned int i, j;
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 16) {
-      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
-      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
-      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
-      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
-      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
-      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
-      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
-      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
-    }
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
-                                            int src_stride,
-                                            int xoffset,
-                                            int yoffset,
-                                            const uint8_t *dst,
-                                            int dst_stride,
-                                            unsigned int *sse) {
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
-
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
-                            kHeight8PlusOne, kWidth8,
-                            BILINEAR_FILTERS_2TAP(xoffset));
-  var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
-                            kWidth8, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride,
-                                              int xoffset,
-                                              int yoffset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
-                             kHeight16PlusOne, kWidth16,
-                             BILINEAR_FILTERS_2TAP(xoffset));
-  var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
-                             kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,
-                   kHeight32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
-}
-
-unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride,
-                                              int xoffset,
-                                              int yoffset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);
-  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
-                             kHeight32PlusOne, kWidth32,
-                             BILINEAR_FILTERS_2TAP(xoffset));
-  var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
-                             kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
-}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c
new file mode 100644
index 0000000..611adb1
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_avg_msa.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+  uint32_t sum_out;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+  v4u32 sum = { 0 };
+
+  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
+  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
+  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
+  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
+  sum0 += sum4;
+
+  sum = __msa_hadd_u_w(sum0, sum0);
+  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
+  sum = __msa_hadd_u_w(sum0, sum0);
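+  /* round and shift right by 6: the 8x8 pixel total divided by 64 */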
+  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
+  sum_out = __msa_copy_u_w((v4i32)sum, 0);
+
+  return sum_out;
+}
+
+uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+  uint32_t sum_out;
+  uint32_t src0, src1, src2, src3;
+  v16u8 vec = { 0 };
+  v8u16 sum0;
+  v4u32 sum1;
+  v2u64 sum2;
+
+  LW4(src, src_stride, src0, src1, src2, src3);
+  INSERT_W4_UB(src0, src1, src2, src3, vec);
+
+  sum0 = __msa_hadd_u_h(vec, vec);
+  sum1 = __msa_hadd_u_w(sum0, sum0);
+  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
+  sum1 = __msa_hadd_u_w(sum0, sum0);
+  sum2 = __msa_hadd_u_d(sum1, sum1);
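+  /* 16 pixels were summed: round and shift right by 4 */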
+  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
+  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
+
+  return sum_out;
+}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
new file mode 100644
index 0000000..1dc70bd
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
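+/* Emits a block-error routine for one fixed block size: returns the sum of
+ * squared (coeff - dq_coeff) differences and stores the sum of squared
+ * coefficients in *ssz, 16 coefficients per iteration. */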
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                   \
+static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr,     \
+                                             const int16_t *dq_coeff_ptr,  \
+                                             int64_t *ssz) {               \
+  int64_t err = 0;                                                         \
+  uint32_t loop_cnt;                                                       \
+  v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h;                             \
+  v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w;                              \
+  v2i64 sq_coeff_r, sq_coeff_l;                                            \
+  v2i64 err0, err_dup0, err1, err_dup1;                                    \
+                                                                           \
+  coeff = LD_SH(coeff_ptr);                                                \
+  dq_coeff = LD_SH(dq_coeff_ptr);                                          \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w,                  \
+              sq_coeff_r, sq_coeff_l);                                     \
+  DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1);                 \
+                                                                           \
+  coeff = LD_SH(coeff_ptr + 8);                                            \
+  dq_coeff = LD_SH(dq_coeff_ptr + 8);                                      \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);              \
+  DPADD_SD2_SD(diff_r, diff_l, err0, err1);                                \
+                                                                           \
+  coeff_ptr += 16;                                                         \
+  dq_coeff_ptr += 16;                                                      \
+                                                                           \
+  for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) {                       \
+    coeff = LD_SH(coeff_ptr);                                              \
+    dq_coeff = LD_SH(dq_coeff_ptr);                                        \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff = LD_SH(coeff_ptr + 8);                                          \
+    dq_coeff = LD_SH(dq_coeff_ptr + 8);                                    \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff_ptr += 16;                                                       \
+    dq_coeff_ptr += 16;                                                    \
+  }                                                                        \
+                                                                           \
+  err_dup0 = __msa_splati_d(sq_coeff_r, 1);                                \
+  err_dup1 = __msa_splati_d(sq_coeff_l, 1);                                \
+  sq_coeff_r += err_dup0;                                                  \
+  sq_coeff_l += err_dup1;                                                  \
+  *ssz = __msa_copy_s_d(sq_coeff_r, 0);                                    \
+  *ssz += __msa_copy_s_d(sq_coeff_l, 0);                                   \
+                                                                           \
+  err_dup0 = __msa_splati_d(err0, 1);                                      \
+  err_dup1 = __msa_splati_d(err1, 1);                                      \
+  err0 += err_dup0;                                                        \
+  err1 += err_dup1;                                                        \
+  err = __msa_copy_s_d(err0, 0);                                           \
+  err += __msa_copy_s_d(err1, 0);                                          \
+                                                                           \
+  return err;                                                              \
+}
+
+BLOCK_ERROR_BLOCKSIZE_MSA(16);
+BLOCK_ERROR_BLOCKSIZE_MSA(64);
+BLOCK_ERROR_BLOCKSIZE_MSA(256);
+BLOCK_ERROR_BLOCKSIZE_MSA(1024);
+
+int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr,
+                            intptr_t blk_size, int64_t *ssz) {
+  int64_t err;
+  const int16_t *coeff = (const int16_t *)coeff_ptr;
+  const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
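+  /* Dispatch on the coefficient count (4x4 through 32x32 blocks); any other
+     size falls back to the C implementation. */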
+  switch (blk_size) {
+    case 16:
+      err = block_error_16size_msa(coeff, dq_coeff, ssz);
+      break;
+    case 64:
+      err = block_error_64size_msa(coeff, dq_coeff, ssz);
+      break;
+    case 256:
+      err = block_error_256size_msa(coeff, dq_coeff, ssz);
+      break;
+    case 1024:
+      err = block_error_1024size_msa(coeff, dq_coeff, ssz);
+      break;
+    default:
+      err = vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+      break;
+  }
+
+  return err;
+}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
new file mode 100644
index 0000000..6dabb58
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -0,0 +1,507 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+                                   const int32_t *const0, int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+
+  /* load input data */
+  r0 = LD_SH(input);
+  r15 = LD_SH(input + 15 * stride);
+  r7 = LD_SH(input + 7 * stride);
+  r8 = LD_SH(input + 8 * stride);
+  SLLI_4V(r0, r15, r7, r8, 2);
+
+  /* stage 1 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 8, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * stride);
+  r4 = LD_SH(input + 4 * stride);
+  r11 = LD_SH(input + 11 * stride);
+  r12 = LD_SH(input + 12 * stride);
+  SLLI_4V(r3, r4, r11, r12, 2);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp2, int_buf, 8);
+  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+  ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+  r9 = LD_SH(input + 9 * stride);
+  r6 = LD_SH(input + 6 * stride);
+  r1 = LD_SH(input + stride);
+  r14 = LD_SH(input + 14 * stride);
+  SLLI_4V(r9, r6, r1, r14, 2);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r13 = LD_SH(input + 13 * stride);
+  r2 = LD_SH(input + 2 * stride);
+  r5 = LD_SH(input + 5 * stride);
+  r10 = LD_SH(input + 10 * stride);
+  SLLI_4V(r13, r2, r5, r10, 2);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 128;
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+
+  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4 */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data */
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+  out += 64;
+
+  /* load input data */
+  input += 128;
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+                                   int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+
+  /* load input data */
+  r0 = LD_SH(input);
+  r7 = LD_SH(input + 7 * 8);
+  r8 = LD_SH(input + 8 * 8);
+  r15 = LD_SH(input + 15 * 8);
+
+  /* stage 1 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 4 * 2, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * 8);
+  r4 = LD_SH(input + 4 * 8);
+  r11 = LD_SH(input + 11 * 8);
+  r12 = LD_SH(input + 12 * 8);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp1, int_buf, 4 * 8);
+  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+  r1 = LD_SH(input + 8);
+  r6 = LD_SH(input + 6 * 8);
+  r9 = LD_SH(input + 9 * 8);
+  r14 = LD_SH(input + 14 * 8);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r2 = LD_SH(input + 2 * 8);
+  r5 = LD_SH(input + 5 * 8);
+  r10 = LD_SH(input + 10 * 8);
+  r13 = LD_SH(input + 13 * 8);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 8;
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+
+  g13 = LD_SH(int_buf + 3 * 8);
+  g15 = LD_SH(int_buf + 7 * 8);
+  g5 = LD_SH(int_buf + 11 * 8);
+  g7 = LD_SH(int_buf + 15 * 8);
+
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4 */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data */
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+  out += 16 * 8;
+
+  /* load input data */
+  input += 128;
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+  int16_t *temp = intermediate;
+  int16_t *out = output;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+  v8i16 in12, in13, in14, in15;
+
+  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  temp = intermediate + 8;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT_POSTPROC_2V_NEG_H(in0, in1);
+  FDCT_POSTPROC_2V_NEG_H(in2, in3);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in6, in7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  temp = intermediate;
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  temp = intermediate;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+               in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  out = output + 8;
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void vp9_fht16x16_msa(const int16_t *input, int16_t *output,
+                      int32_t stride, int32_t tx_type) {
+  DECLARE_ALIGNED(32, int16_t, tmp[256]);
+  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+  int32_t i;
+  int16_t *ptmpbuf = &tmp_buf[0];
+  int16_t *trans = &trans_buf[0];
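+  /* Each constant is stored four times in a row so that LD_SW()/LD_SW2()
+     load it already replicated across all four lanes of a v4i32 vector. */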
+  const int32_t const_arr[29 * 4] = {
+    52707308, 52707308, 52707308, 52707308,
+    -1072430300, -1072430300, -1072430300, -1072430300,
+    795618043, 795618043, 795618043, 795618043,
+    -721080468, -721080468, -721080468, -721080468,
+    459094491, 459094491, 459094491, 459094491,
+    -970646691, -970646691, -970646691, -970646691,
+    1010963856, 1010963856, 1010963856, 1010963856,
+    -361743294, -361743294, -361743294, -361743294,
+    209469125, 209469125, 209469125, 209469125,
+    -1053094788, -1053094788, -1053094788, -1053094788,
+    1053160324, 1053160324, 1053160324, 1053160324,
+    639644520, 639644520, 639644520, 639644520,
+    -862444000, -862444000, -862444000, -862444000,
+    1062144356, 1062144356, 1062144356, 1062144356,
+    -157532337, -157532337, -157532337, -157532337,
+    260914709, 260914709, 260914709, 260914709,
+    -1041559667, -1041559667, -1041559667, -1041559667,
+    920985831, 920985831, 920985831, 920985831,
+    -551995675, -551995675, -551995675, -551995675,
+    596522295, 596522295, 596522295, 596522295,
+    892853362, 892853362, 892853362, 892853362,
+    -892787826, -892787826, -892787826, -892787826,
+    410925857, 410925857, 410925857, 410925857,
+    -992012162, -992012162, -992012162, -992012162,
+    992077698, 992077698, 992077698, 992077698,
+    759246145, 759246145, 759246145, 759246145,
+    -759180609, -759180609, -759180609, -759180609,
+    -759222975, -759222975, -759222975, -759222975,
+    759288511, 759288511, 759288511, 759288511 };
+
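+  /* The 16x16 transform is split into two 8-column passes followed by two
+     8-row passes; the ADST row paths add an explicit transpose before and
+     after the row transform (fadst16_transpose_*). */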
+  switch (tx_type) {
+    case DCT_DCT:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case ADST_DCT:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case DCT_ADST:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    case ADST_ADST:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
new file mode 100644
index 0000000..574016f
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
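+/* 4x4 forward Walsh-Hadamard transform (the transform VP9 uses for lossless
+ * coding), vectorized with MSA. */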
+void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4;
+
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  in0 += in1;
+  in3 -= in2;
+  in4 = (in0 - in3) >> 1;
+  SUB2(in4, in1, in4, in2, in1, in2);
+  in0 -= in2;
+  in3 += in1;
+
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+  in0 += in2;
+  in1 -= in3;
+  in4 = (in0 - in1) >> 1;
+  SUB2(in4, in2, in4, in3, in2, in3);
+  in0 -= in3;
+  in1 += in2;
+
+  SLLI_4V(in0, in1, in2, in3, 2);
+
+  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+  ST4x2_UB(in0, output, 4);
+  ST4x2_UB(in3, output + 4, 4);
+  ST4x2_UB(in1, output + 8, 4);
+  ST4x2_UB(in2, output + 12, 4);
+}
+
+void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+
+  LD_SH4(input, stride, in0, in1, in2, in3);
+
+  /* fdct4 pre-process */
+  {
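+    /* Mirrors the scalar fdct4x4 pre-processing: inputs are scaled up by 16
+     * and the first input sample gets +1 added when it is non-zero; the
+     * sldi-derived mask restricts that +1 to a single lane of in0. */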
+    v8i16 temp, mask;
+    v16i8 zero = { 0 };
+    v16i8 one = __msa_ldi_b(1);
+
+    mask = (v8i16)__msa_sldi_b(zero, one, 15);
+    SLLI_4V(in0, in1, in2, in3, 4);
+    temp = __msa_ceqi_h(in0, 0);
+    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
+    temp = mask & temp;
+    in0 += temp;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
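+  /* Final rounding as in the scalar reference: (x + 1) >> 2, then pack the
+   * rows and store the 4x4 result. */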
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  SRA_4V(in0, in1, in2, in3, 2);
+  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output, 8);
+}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
new file mode 100644
index 0000000..7c3c635
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
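+  /* Up-shift the inputs by 2 (x4), matching the first-pass scaling of the
+   * scalar 8x8 forward transform. */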
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
new file mode 100644
index 0000000..d7d40cb
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_ports/mem.h"
+
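+/* 8-point forward ADST operating on eight v8i16 vectors; coeff0_m/coeff1_m
+ * hold the cospi_*_64 constants consumed by the dot-product helpers. */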
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                   \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {         \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                        \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                         \
+  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,      \
+                     cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };  \
+  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64,    \
+                     cospi_24_64, -cospi_24_64, 0, 0 };                     \
+                                                                            \
+  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+                                                                            \
+  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in7, in0,                \
+                        in4, in3);                                          \
+                                                                            \
+  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+                                                                            \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in5, in2,                \
+                        in6, in1);                                          \
+  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                    \
+  out7 = -s0_m;                                                             \
+  out0 = s1_m;                                                              \
+                                                                            \
+  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);   \
+                                                                            \
+  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);        \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+  cnst1_m = cnst0_m;                                                        \
+                                                                            \
+  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst2_m, cnst3_m, cnst1_m, out1, out6,              \
+                        s0_m, s1_m);                                        \
+                                                                            \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                           \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                  \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                    \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                    \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                    \
+                                                                            \
+  out1 = -out1;                                                             \
+  out3 = -out3;                                                             \
+  out5 = -out5;                                                             \
+}
+
+#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m, constant_m;                       \
+  v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m;                       \
+                                                                  \
+  UNPCK_R_SH_SW(in0, in0_r_m);                                    \
+  UNPCK_R_SH_SW(in1, in1_r_m);                                    \
+  UNPCK_R_SH_SW(in2, in2_r_m);                                    \
+  UNPCK_R_SH_SW(in3, in3_r_m);                                    \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_4_9);                           \
+  MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m);     \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_1_9);                           \
+  s0_m += in0_r_m * constant_m;                                   \
+  s1_m -= in1_r_m * constant_m;                                   \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_2_9);                           \
+  s0_m += in1_r_m * constant_m;                                   \
+  s1_m += in3_r_m * constant_m;                                   \
+                                                                  \
+  s2_m = in0_r_m + in1_r_m - in3_r_m;                             \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_3_9);                           \
+  MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m);     \
+                                                                  \
+  in0_r_m = s0_m + s3_m;                                          \
+  s2_m = s1_m - s3_m;                                             \
+  s3_m = s1_m - s0_m + s3_m;                                      \
+                                                                  \
+  SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS);      \
+  PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m,     \
+              s3_m, s3_m, out0, out1, out2, out3);                \
+}
+#endif  /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
diff --git a/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
new file mode 100644
index 0000000..363aabb
--- /dev/null
+++ b/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
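+/* Per-pixel temporal filter weight, mirroring vp9_temporal_filter_apply_c:
+ *   diff = frm1 - frm2, mod = (3 * diff * diff) >> strength (rounded),
+ *   mod  = (mod < 16) ? (16 - mod) * filt_wgt : 0,
+ *   cnt += mod, acc += mod * frm2 (predictor pixel).
+ * This variant covers 8x8 blocks, four source rows per loop iteration. */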
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frm2_ptr,
+                                            int32_t filt_sth,
+                                            int32_t filt_wgt,
+                                            uint32_t *acc,
+                                            uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3;
+  v16i8 frm2, frm1 = { 0 };
+  v16i8 frm4, frm3 = { 0 };
+  v16u8 frm_r, frm_l;
+  v8i16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 2; row--;) {
+    LD4(frm1_ptr, stride, f0, f1, f2, f3);
+    frm1_ptr += (4 * stride);
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 32;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    INSERT_D2_SB(f0, f1, frm1);
+    INSERT_D2_SB(f2, f3, frm3);
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
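+/* Same filtering for 16x16 blocks; each loop iteration covers two 16-pixel
+ * rows. */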
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth,
+                                             int32_t filt_wgt,
+                                             uint32_t *acc,
+                                             uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+
+void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  if (8 == blk_w && 8 == blk_h) {
+    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+                                    strength, filt_wgt, accu, cnt);
+  } else if (16 == blk_w && 16 == blk_h) {
+    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                     strength, filt_wgt, accu, cnt);
+  } else {
+    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                strength, filt_wgt, accu, cnt);
+  }
+}
diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.c b/libvpx/vp9/encoder/vp9_aq_complexity.c
index 33f9239..15f227f 100644
--- a/libvpx/vp9/encoder/vp9_aq_complexity.c
+++ b/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -10,23 +10,37 @@
 
 #include <limits.h>
 #include <math.h>
+#include "vpx_ports/system_state.h"
 
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/common/vp9_seg_common.h"
-
 #include "vp9/encoder/vp9_segmentation.h"
 
-#define AQ_C_SEGMENTS  3
-#define AQ_C_STRENGTHS  3
-static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+#define AQ_C_SEGMENTS  5
+#define DEFAULT_AQ2_SEG 3   // Neutral Q segment
+#define AQ_C_STRENGTHS 3
 static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
-  {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+  { {1.75, 1.25, 1.05, 1.00, 0.90},
+    {2.00, 1.50, 1.15, 1.00, 0.85},
+    {2.50, 1.75, 1.25, 1.00, 0.80} };
 static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
-  {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+  { {0.15, 0.30, 0.55, 2.00, 100.0},
+    {0.20, 0.40, 0.65, 2.00, 100.0},
+    {0.25, 0.50, 0.75, 2.00, 100.0} };
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {-4.0, -3.0, -2.0, 100.00, 100.0},
+    {-3.5, -2.5, -1.5, 100.00, 100.0},
+    {-3.0, -2.0, -1.0, 100.00, 100.0} };
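+// In the tables above, the row index is the aq strength (0..2) and the column
+// is the segment id; segment DEFAULT_AQ2_SEG (3) is the neutral segment that
+// keeps the baseline Q (adjustment factor 1.0).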
 
-static int get_aq_c_strength(int q_index) {
+#define DEFAULT_COMPLEXITY 64
+
+
+static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
   // Approximate base quantizer (truncated to int)
-  int base_quant = vp9_ac_quant(q_index, 0) / 4;
-  return (base_quant > 20) + (base_quant > 45);
+  const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
+  return (base_quant > 10) + (base_quant > 25);
 }
 
 void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
@@ -34,20 +48,16 @@
   struct segmentation *const seg = &cm->seg;
 
   // Make SURE use of floating point in this function is safe.
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   if (cm->frame_type == KEY_FRAME ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
-    const int aq_strength = get_aq_c_strength(cm->base_qindex);
-    const int active_segments = aq_c_active_segments[aq_strength];
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
 
     // Clear down the segment map.
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-
-    // Clear down the complexity map used for rd.
-    vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
 
     vp9_clearall_segfeatures(seg);
 
@@ -63,14 +73,21 @@
     // Select delta coding method.
     seg->abs_delta = SEGMENT_DELTADATA;
 
-    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
-    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+    // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
 
     // Use some of the segments for in frame Q adjustment.
-    for (segment = 1; segment < active_segments; ++segment) {
-      int qindex_delta =
-          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     aq_c_q_adj_factor[aq_strength][segment]);
+    for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+      int qindex_delta;
+
+      if (segment == DEFAULT_AQ2_SEG)
+        continue;
+
+      qindex_delta =
+        vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                   aq_c_q_adj_factor[aq_strength][segment],
+                                   cm->bit_depth);
+
 
       // For AQ complexity mode, we don't allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -87,55 +104,54 @@
   }
 }
 
-// Select a segment for the current SB64 block.
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
 // The choice of segment for a block depends on the ratio of the projected
-// bits for the block vs a target average.
-// An "aq_strength" value determines how many segments are supported,
-// the set of transition points to use and the extent of the quantizer
-// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
-                                   int mi_row, int mi_col,
-                                   int output_enabled, int projected_rate) {
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
   VP9_COMMON *const cm = &cpi->common;
 
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
   const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int complexity_metric = 64;
+  const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+  const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
   int x, y;
-
+  int i;
   unsigned char segment;
 
-  if (!output_enabled) {
-    segment = 0;
+  if (0) {
+    segment = DEFAULT_AQ2_SEG;
   } else {
     // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
     // It is converted to bits * 256 units.
     const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
                             (bw * bh);
-    const int aq_strength = get_aq_c_strength(cm->base_qindex);
-    const int active_segments = aq_c_active_segments[aq_strength];
+    double logvar;
+    double low_var_thresh;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
 
-    // The number of segments considered and the transition points used to
-    // select them is determined by the "aq_strength" value.
-    // Currently this loop only supports segments that reduce Q (i.e. where
-    // there is undershoot.
-    // The loop counts down towards segment 0 which is the default segment
-    // with no Q adjustment.
-    segment = active_segments - 1;
-    while (segment > 0) {
-      if (projected_rate <
-          (target_rate * aq_c_transitions[aq_strength][segment])) {
+    vpx_clear_system_state();
+    low_var_thresh = (cpi->oxcf.pass == 2)
+      ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+      : DEFAULT_LV_THRESH;
+
+    vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+    logvar = vp9_log_block_var(cpi, mb, bs);
+
+    segment = AQ_C_SEGMENTS - 1;    // Just in case no break out below.
+    for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+      // Test rate against a threshold value and variance against a threshold.
+      // Increasing segment number (higher variance and complexity) = higher Q.
+      if ((projected_rate <
+           target_rate * aq_c_transitions[aq_strength][i]) &&
+          (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+        segment = i;
         break;
       }
-      --segment;
-    }
-
-    if (target_rate > 0) {
-      complexity_metric =
-        clamp((int)((projected_rate * 64) / target_rate), 16, 255);
     }
   }
 
@@ -143,8 +159,6 @@
   for (y = 0; y < ymis; y++) {
     for (x = 0; x < xmis; x++) {
       cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
-      cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
-        (unsigned char)complexity_metric;
     }
   }
 }
diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.h b/libvpx/vp9/encoder/vp9_aq_complexity.h
index af031a4..e9acb1c 100644
--- a/libvpx/vp9/encoder/vp9_aq_complexity.h
+++ b/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -16,12 +16,15 @@
 extern "C" {
 #endif
 
+#include "vp9/common/vp9_enums.h"
+
 struct VP9_COMP;
+struct macroblock;
 
-// Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
-                                   int output_enabled, int projected_rate);
-
+// Select a segment for the current Block.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+                            BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate);
 
 // This function sets up a set of segments with delta Q values around
 // the baseline frame quantizer.
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index e7f0daa..e6b3686 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -11,6 +11,8 @@
 #include <limits.h>
 #include <math.h>
 
+#include "vpx_ports/system_state.h"
+
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 
 #include "vp9/common/vp9_seg_common.h"
@@ -19,34 +21,45 @@
 #include "vp9/encoder/vp9_segmentation.h"
 
 struct CYCLIC_REFRESH {
-  // Percentage of super-blocks per frame that are targeted as candidates
+  // Percentage of blocks per frame that are targeted as candidates
   // for cyclic refresh.
-  int max_sbs_perframe;
+  int percent_refresh;
   // Maximum q-delta as percentage of base q.
   int max_qdelta_perc;
-  // Block size below which we don't apply cyclic refresh.
-  BLOCK_SIZE min_block_size;
   // Superblock starting index for cycling through the frame.
   int sb_index;
-  // Controls how long a block will need to wait to be refreshed again.
+  // Controls how long a block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all-zero motion a block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
   int time_for_refresh;
-  // Actual number of (8x8) blocks that were applied delta-q (segment 1).
-  int num_seg_blocks;
-  // Actual encoding bits for segment 1.
-  int actual_seg_bits;
+  // Target number of (8x8) blocks that are set for delta-q.
+  int target_num_seg_blocks;
+  // Actual number of (8x8) blocks that were applied delta-q.
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
   // RD mult. parameters for segment 1.
   int rdmult;
   // Cyclic refresh map.
   signed char *map;
-  // Projected rate and distortion for the current superblock.
-  int64_t projected_rate_sb;
-  int64_t projected_dist_sb;
-  // Thresholds applied to projected rate/distortion of the superblock.
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
+  // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether block should be refreshed.
   int64_t thresh_rate_sb;
   int64_t thresh_dist_sb;
+  // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether block should be refreshed.
+  int16_t motion_thresh;
+  // Rate target ratio to set q delta.
+  double rate_ratio_qdelta;
+  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  int rate_boost_fac;
+  double low_content_avg;
+  int qindex_delta[3];
 };
 
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
   if (cr == NULL)
     return NULL;
@@ -56,12 +69,21 @@
     vpx_free(cr);
     return NULL;
   }
+  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+  cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+  if (cr->last_coded_q_map == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+  assert(MAXQ <= 255);
+  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
 
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   vpx_free(cr->map);
+  vpx_free(cr->last_coded_q_map);
   vpx_free(cr);
 }
 
@@ -73,10 +95,10 @@
   // with number of seg blocks, so compare available bits to number of blocks.
   // Average bits available per frame = avg_frame_bandwidth
   // Number of (8x8) blocks in frame = mi_rows * mi_cols;
-  const float factor  = 0.5;
+  const float factor = 0.25;
   const int number_blocks = cm->mi_rows  * cm->mi_cols;
   // The condition below corresponds to turning off at target bitrates:
-  // ~24kbps for CIF, 72kbps for VGA (at 30fps).
+  // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
   // Also turn off at very small frame sizes, to avoid too large fraction of
   // superblocks to be refreshed per frame. Threshold below is less than QCIF.
   if (rc->avg_frame_bandwidth < factor * number_blocks ||
@@ -92,33 +114,99 @@
 // mode, and rate/distortion.
 static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
                                 const MB_MODE_INFO *mbmi,
-                                BLOCK_SIZE bsize, int use_rd) {
-  if (use_rd) {
-    // If projected rate is below the thresh_rate (well below target,
-    // so undershoot expected), accept it for lower-qp coding.
-    if (cr->projected_rate_sb < cr->thresh_rate_sb)
-      return 1;
-    // Otherwise, reject the block for lower-qp coding if any of the following:
-    // 1) prediction block size is below min_block_size
-    // 2) mode is non-zero mv and projected distortion is above thresh_dist
-    // 3) mode is an intra-mode (we may want to allow some of this under
-    // another thresh_dist)
-    else if (bsize < cr->min_block_size ||
-             (mbmi->mv[0].as_int != 0 &&
-              cr->projected_dist_sb > cr->thresh_dist_sb) ||
-             !is_inter_block(mbmi))
-      return 0;
-    else
-      return 1;
-  } else {
-    // Rate/distortion not used for update.
-    if (bsize < cr->min_block_size ||
-        mbmi->mv[0].as_int != 0 ||
-        !is_inter_block(mbmi))
-      return 0;
-    else
-      return 1;
+                                int64_t rate,
+                                int64_t dist,
+                                int bsize) {
+  MV mv = mbmi->mv[0].as_mv;
+  // Reject the block for lower-qp coding if projected distortion
+  // is above the threshold, and any of the following is true:
+  // 1) mode uses large mv
+  // 2) mode is an intra-mode
+  // Otherwise accept for refresh.
+  if (dist > cr->thresh_dist_sb &&
+      (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+       mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+       !is_inter_block(mbmi)))
+    return CR_SEGMENT_ID_BASE;
+  else  if (bsize >= BLOCK_16X16 &&
+            rate < cr->thresh_rate_sb &&
+            is_inter_block(mbmi) &&
+            mbmi->mv[0].as_int == 0 &&
+            cr->rate_boost_fac > 10)
+    // More aggressive delta-q for bigger blocks with zero motion.
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type,
+                                          q, rate_factor,
+                                          cpi->common.bit_depth);
+  if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+    deltaq = -cr->max_qdelta_perc * q / 100;
   }
+  return deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from non-base segment. For now ignore effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
+                                          double correction_factor) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int estimated_bits;
+  int mbs = cm->MBs;
+  int num8x8bl = mbs << 2;
+  // Weight for non-base segments: use actual number of blocks refreshed in
+  // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+  // Take segment weighted average for estimated bits.
+  estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) *
+      vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+                             correction_factor, cm->bit_depth) +
+                             weight_segment1 *
+      vp9_estimate_bits_at_q(cm->frame_type,
+                             cm->base_qindex + cr->qindex_delta[1], mbs,
+                             correction_factor, cm->bit_depth) +
+                             weight_segment2 *
+      vp9_estimate_bits_at_q(cm->frame_type,
+                             cm->base_qindex + cr->qindex_delta[2], mbs,
+                             correction_factor, cm->bit_depth));
+  return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called in the
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
+                                      double correction_factor) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int bits_per_mb;
+  int num8x8bl = cm->MBs << 2;
+  // Weight for segment prior to encoding: take the average of the target
+  // number for the frame to be encoded and the actual from the previous frame.
+  double weight_segment = (double)((cr->target_num_seg_blocks +
+      cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) /
+      num8x8bl;
+  // Compute delta-q corresponding to qindex i.
+  int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+  // Take segment weighted average for bits per mb.
+  bits_per_mb = (int)((1.0 - weight_segment) *
+      vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) +
+      weight_segment *
+      vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor,
+                         cm->bit_depth));
+  return bits_per_mb;
 }
 
 // Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
@@ -127,7 +215,10 @@
 void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
                                        MB_MODE_INFO *const mbmi,
                                        int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize, int use_rd) {
+                                       BLOCK_SIZE bsize,
+                                       int64_t rate,
+                                       int64_t dist,
+                                       int skip) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -135,21 +226,26 @@
   const int xmis = MIN(cm->mi_cols - mi_col, bw);
   const int ymis = MIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = cpi->mb.in_static_area ||
-                                 candidate_refresh_aq(cr, mbmi, bsize, use_rd);
+  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
+                                                      bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
-  // Check if we should reset the segment_id for this block.
-  if (mbmi->segment_id > 0 && !refresh_this_block)
-    mbmi->segment_id = 0;
+  // If this block is labeled for refresh, check if we should reset the
+  // segment_id.
+  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+    mbmi->segment_id = refresh_this_block;
+    // Reset segment_id if will be skipped.
+    if (skip)
+      mbmi->segment_id = CR_SEGMENT_ID_BASE;
+  }
 
   // Update the cyclic refresh map, to be used for setting segmentation map
   // for the next frame. If the block  will be refreshed this frame, mark it
   // as clean. The magnitude of the -ve influences how long before we consider
   // it for refresh again.
-  if (mbmi->segment_id == 1) {
+  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
     new_map_value = -cr->time_for_refresh;
   } else if (refresh_this_block) {
     // Else if it is accepted as candidate for refresh, and has not already
@@ -161,18 +257,227 @@
     // Leave it marked as block that is not candidate for refresh.
     new_map_value = 1;
   }
+
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
   for (y = 0; y < ymis; y++)
     for (x = 0; x < xmis; x++) {
-      cr->map[block_index + y * cm->mi_cols + x] = new_map_value;
-      cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
-          mbmi->segment_id;
+      int map_offset = block_index + y * cm->mi_cols + x;
+      cr->map[map_offset] = new_map_value;
+      cpi->segmentation_map[map_offset] = mbmi->segment_id;
+      // Inter skip blocks were clearly not coded at the current qindex, so
+      // don't update the map for them. For cases where motion is non-zero or
+      // the reference frame isn't the previous frame, the previous value in
+      // the map for this spatial location is not entirely correct.
+      if (!is_inter_block(mbmi) || !skip)
+        cr->last_coded_q_map[map_offset] = clamp(
+            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
     }
-  // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
-  // for encoding this frame.
-  if (mbmi->segment_id)
-    cr->num_seg_blocks += xmis * ymis;
+}
+
+// Update the actual number of blocks that were applied the segment delta q.
+void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int mi_row, mi_col;
+  cr->actual_num_seg1_blocks = 0;
+  cr->actual_num_seg2_blocks = 0;
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+      if (cyclic_refresh_segment_id(
+          seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST1)
+        cr->actual_num_seg1_blocks++;
+      else if (cyclic_refresh_segment_id(
+          seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST2)
+        cr->actual_num_seg2_blocks++;
+    }
+}
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // Set minimum gf_interval for GF update to a multiple (== 4) of refresh
+  // period. Depending on past encoding stats, GF flag may be reset and update
+  // may not occur until next baseline_gf_interval.
+  if (cr->percent_refresh > 0)
+    rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+  else
+    rc->baseline_gf_interval = 40;
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated, check if we should NOT update the golden
+// ref.
+void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int mi_row, mi_col;
+  double fraction_low = 0.0;
+  int low_content_frame = 0;
+
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt1 = 0, cnt2 = 0;
+  int force_gf_refresh = 0;
+
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
+          mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+      int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 ?
+          mi[0]->mbmi.mv[0].as_mv.col : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+      // Calculate the motion of the background.
+      if (abs_mvr <= 16 && abs_mvc <= 16) {
+        cnt1++;
+        if (abs_mvr == 0 && abs_mvc == 0)
+          cnt2++;
+      }
+      mi++;
+
+      // Accumulate low_content_frame.
+      if (cr->map[mi_row * cols + mi_col] < 1)
+        low_content_frame++;
+    }
+    mi += 8;
+  }
+
+  // For video conference clips, if the background has high motion in current
+  // frame because of the camera movement, set this frame as the golden frame.
+  // Use 70% and 5% as the thresholds for golden frame refreshing.
+  // Also, force this frame as a golden update frame if this frame will change
+  // the resolution (resize_pending != 0).
+  if (cpi->resize_pending != 0 ||
+     (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+    vp9_cyclic_refresh_set_golden_update(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    force_gf_refresh = 1;
+  }
+
+  fraction_low =
+      (double)low_content_frame / (rows * cols);
+  // Update average.
+  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+    // Don't update golden reference if the amount of low_content for the
+    // current encoded frame is small, or if the recursive average of the
+    // low_content over the update interval window falls below threshold.
+    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+      cpi->refresh_golden_frame = 0;
+    // Reset for next interval.
+    cr->low_content_avg = fraction_low;
+  }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+  int xmis, ymis, x, y;
+  memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sbs_in_frame = sb_cols * sb_rows;
+  // Number of target blocks to get the q delta (segment 1).
+  block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  // Set the segmentation map: cycle through the superblocks, starting at
+  // cr->sb_index, and stopping when either block_count blocks have been found
+  // to be refreshed, or we have passed through whole frame.
+  assert(cr->sb_index < sbs_in_frame);
+  i = cr->sb_index;
+  cr->target_num_seg_blocks = 0;
+  do {
+    int sum_map = 0;
+    // Get the mi_row/mi_col corresponding to superblock index i.
+    int sb_row_index = (i / sb_cols);
+    int sb_col_index = i - sb_row_index * sb_cols;
+    int mi_row = sb_row_index * MI_BLOCK_SIZE;
+    int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int qindex_thresh =
+        cpi->oxcf.content == VP9E_CONTENT_SCREEN
+            ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+            : 0;
+    assert(mi_row >= 0 && mi_row < cm->mi_rows);
+    assert(mi_col >= 0 && mi_col < cm->mi_cols);
+    bl_index = mi_row * cm->mi_cols + mi_col;
+    // Loop through all 8x8 blocks in superblock and update map.
+    xmis = MIN(cm->mi_cols - mi_col,
+               num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+    ymis = MIN(cm->mi_rows - mi_row,
+               num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    for (y = 0; y < ymis; y++) {
+      for (x = 0; x < xmis; x++) {
+        const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        // If the block is a candidate for clean-up then mark it for possible
+        // boost/refresh (segment 1). The segment id may get reset to 0 later
+        // if the block gets coded as anything other than ZEROMV.
+        if (cr->map[bl_index2] == 0) {
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh)
+            sum_map++;
+        } else if (cr->map[bl_index2] < 0) {
+          cr->map[bl_index2]++;
+        }
+      }
+    }
+    // Enforce constant segment over superblock.
+    // If segment is at least half of superblock, set to 1.
+    if (sum_map >= xmis * ymis / 2) {
+      for (y = 0; y < ymis; y++)
+        for (x = 0; x < xmis; x++) {
+          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+        }
+      cr->target_num_seg_blocks += xmis * ymis;
+    }
+    i++;
+    if (i == sbs_in_frame) {
+      i = 0;
+    }
+  } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+  cr->sb_index = i;
+}
+
+// Set cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  cr->percent_refresh = 10;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
+  // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+  // periods of the refresh cycle, after a key frame.
+  // Account for larger interval on base layer for temporal layers.
+  if (cr->percent_refresh > 0 &&
+      rc->frames_since_key <  (4 * cpi->svc.number_temporal_layers) *
+      (100 / cr->percent_refresh))
+    cr->rate_ratio_qdelta = 3.0;
+  else
+    cr->rate_ratio_qdelta = 2.0;
+  // Adjust some parameters for low resolutions at low bitrates.
+  if (cm->width <= 352 &&
+      cm->height <= 288 &&
+      rc->avg_frame_bandwidth < 3400) {
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 10;
+  } else {
+    cr->motion_thresh = 32;
+    cr->rate_boost_fac = 17;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -181,47 +486,39 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  unsigned char *const seg_map = cpi->segmentation_map;
   const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
+  if (cm->current_video_frame == 0)
+    cr->low_content_avg = 0.0;
   // Don't apply refresh on key frame or enhancement layer frames.
   if (!apply_cyclic_refresh ||
       (cm->frame_type == KEY_FRAME) ||
-      (cpi->svc.temporal_layer_id > 0)) {
+      (cpi->svc.temporal_layer_id > 0) ||
+      (cpi->svc.spatial_layer_id > 0)) {
     // Set segmentation map to 0 and disable.
-    vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    unsigned char *const seg_map = cpi->segmentation_map;
+    memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME) {
+      memset(cr->last_coded_q_map, MAXQ,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
+    }
     return;
   } else {
     int qindex_delta = 0;
-    int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
-    int xmis, ymis, x, y, qindex2;
-
-    // Rate target ratio to set q delta.
-    const float rate_ratio_qdelta = 2.0;
-    const double q = vp9_convert_qindex_to_q(cm->base_qindex);
-    vp9_clear_system_state();
-    // Some of these parameters may be set via codec-control function later.
-    cr->max_sbs_perframe = 10;
-    cr->max_qdelta_perc = 50;
-    cr->min_block_size = BLOCK_8X8;
-    cr->time_for_refresh = 1;
-    // Set rate threshold to some fraction of target (and scaled by 256).
-    cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
+    int qindex2;
+    const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    vpx_clear_system_state();
+    // Set the rate threshold to a multiple (currently 4, via the << 2) of the
+    // target rate (the target is given by sb64_target_rate and scaled by 256).
+    cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
     // Distortion threshold, quadratic in Q, scale factor to be adjusted.
-    cr->thresh_dist_sb = 8 * (int)(q * q);
-    if (cpi->sf.use_nonrd_pick_mode) {
-      // May want to be more conservative with thresholds in non-rd mode for now
-      // as rate/distortion are derived from model based on prediction residual.
-      cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
-      cr->thresh_dist_sb = 4 * (int)(q * q);
-    }
+    // q will not exceed 457, so (q * q) is within 32bit; see:
+    // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
+    cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
 
-    cr->num_seg_blocks = 0;
     // Set up segmentation.
     // Clear down the segment map.
-    vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_enable_segmentation(&cm->seg);
     vp9_clearall_segfeatures(seg);
     // Select delta coding method.
@@ -234,90 +531,44 @@
     // relative to 0 previous map.
     // seg->temporal_update = 0;
 
-    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
-    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
-    // Use segment 1 for in-frame Q adjustment.
-    vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+    // Use segment BOOST1 for in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
 
-    // Set the q delta for segment 1.
-    qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
-                                              cm->base_qindex,
-                                              rate_ratio_qdelta);
-    // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
-    // previous encoded frame.
-    if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
-      qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
+    // Set the q delta for segment BOOST1.
+    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+    cr->qindex_delta[1] = qindex_delta;
 
-    // Compute rd-mult for segment 1.
+    // Compute rd-mult for segment BOOST1.
     qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
     cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
 
-    vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
 
-    sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-    sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-    sbs_in_frame = sb_cols * sb_rows;
-    // Number of target superblocks to get the q delta (segment 1).
-    block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
-    // Set the segmentation map: cycle through the superblocks, starting at
-    // cr->mb_index, and stopping when either block_count blocks have been found
-    // to be refreshed, or we have passed through whole frame.
-    assert(cr->sb_index < sbs_in_frame);
-    i = cr->sb_index;
-    do {
-      int sum_map = 0;
-      // Get the mi_row/mi_col corresponding to superblock index i.
-      int sb_row_index = (i / sb_cols);
-      int sb_col_index = i - sb_row_index * sb_cols;
-      int mi_row = sb_row_index * MI_BLOCK_SIZE;
-      int mi_col = sb_col_index * MI_BLOCK_SIZE;
-      assert(mi_row >= 0 && mi_row < cm->mi_rows);
-      assert(mi_col >= 0 && mi_col < cm->mi_cols);
-      bl_index = mi_row * cm->mi_cols + mi_col;
-      // Loop through all 8x8 blocks in superblock and update map.
-      xmis = MIN(cm->mi_cols - mi_col,
-                 num_8x8_blocks_wide_lookup[BLOCK_64X64]);
-      ymis = MIN(cm->mi_rows - mi_row,
-                 num_8x8_blocks_high_lookup[BLOCK_64X64]);
-      for (y = 0; y < ymis; y++) {
-        for (x = 0; x < xmis; x++) {
-          const int bl_index2 = bl_index + y * cm->mi_cols + x;
-          // If the block is as a candidate for clean up then mark it
-          // for possible boost/refresh (segment 1). The segment id may get
-          // reset to 0 later if block gets coded anything other than ZEROMV.
-          if (cr->map[bl_index2] == 0) {
-            seg_map[bl_index2] = 1;
-            sum_map++;
-          } else if (cr->map[bl_index2] < 0) {
-            cr->map[bl_index2]++;
-          }
-        }
-      }
-      // Enforce constant segment over superblock.
-      // If segment is partial over superblock, reset to either all 1 or 0.
-      if (sum_map > 0 && sum_map < xmis * ymis) {
-        const int new_value = (sum_map >= xmis * ymis / 2);
-        for (y = 0; y < ymis; y++)
-          for (x = 0; x < xmis; x++)
-            seg_map[bl_index + y * cm->mi_cols + x] = new_value;
-      }
-      i++;
-      if (i == sbs_in_frame) {
-        i = 0;
-      }
-      if (sum_map >= xmis * ymis /2)
-        block_count--;
-    } while (block_count && i != cr->sb_index);
-    cr->sb_index = i;
+    // Set a more aggressive (higher) q delta for segment BOOST2.
+    qindex_delta = compute_deltaq(
+        cpi, cm->base_qindex, MIN(CR_MAX_RATE_TARGET_RATIO,
+        0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+    cr->qindex_delta[2] = qindex_delta;
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Update the segmentation and refresh map.
+    cyclic_refresh_update_map(cpi);
   }
 }
 
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
-                                             int64_t rate_sb, int64_t dist_sb) {
-  cr->projected_rate_sb = rate_sb;
-  cr->projected_dist_sb = dist_sb;
-}
-
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
   return cr->rdmult;
 }
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  cr->sb_index = 0;
+  cpi->refresh_golden_frame = 1;
+}
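For reference, the per-superblock activation thresholds set in vp9_cyclic_refresh_setup() scale as follows. A minimal standalone sketch (illustrative only; the sb64_target_rate and q values are hypothetical):

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int sb64_target_rate = 500;  /* hypothetical 64x64 target, internal rate units */
    const double q = 40.0;             /* hypothetical quantizer value for base_qindex */
    /* 4x the target rate, with the target itself scaled by 256. */
    const int64_t thresh_rate_sb = ((int64_t)sb64_target_rate << 8) << 2;
    /* Distortion threshold, quadratic in q, scaled by 4. */
    const int64_t thresh_dist_sb = ((int64_t)(q * q)) << 2;
    printf("thresh_rate_sb = %lld, thresh_dist_sb = %lld\n",
           (long long)thresh_rate_sb, (long long)thresh_dist_sb);
    return 0;
  }
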
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index f556d65..29d2a91 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -18,6 +18,15 @@
 extern "C" {
 #endif
 
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE    0
+#define CR_SEGMENT_ID_BOOST1  1
+#define CR_SEGMENT_ID_BOOST2  2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
 struct VP9_COMP;
 
 struct CYCLIC_REFRESH;
@@ -27,22 +36,61 @@
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
 
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi,
+                                          double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
+                                      double correction_factor);
+
 // Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
 void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
                                        MB_MODE_INFO *const mbmi,
-                                       int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize, int use_rd);
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void vp9_cyclic_refresh_update_map(struct VP9_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
 
 // Setup cyclic background refresh: set delta q and segmentation map.
 void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
 
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
-                                             int64_t rate_sb, int64_t dist_sb);
-
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
 
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  return segment_id == CR_SEGMENT_ID_BOOST1 ||
+         segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  if (segment_id == CR_SEGMENT_ID_BOOST1)
+    return CR_SEGMENT_ID_BOOST1;
+  else if (segment_id == CR_SEGMENT_ID_BOOST2)
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BASE;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
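The CR_MAX_RATE_TARGET_RATIO cap interacts with rate_boost_fac when the BOOST2 delta-q is computed in vp9_cyclic_refresh_setup(). A minimal standalone sketch (illustrative only) of the effective BOOST2 rate-target ratio under the two parameter regimes set in vp9_cyclic_refresh_update_parameters():

  #include <stdio.h>

  #define CR_MAX_RATE_TARGET_RATIO 4.0
  #define MIN(a, b) ((a) < (b) ? (a) : (b))

  int main(void) {
    const int rate_boost_fac[2] = {10, 17};          /* low-res/low-rate, default */
    const double rate_ratio_qdelta[2] = {3.0, 2.0};  /* early in cycle, steady state */
    int i, j;
    for (i = 0; i < 2; ++i)
      for (j = 0; j < 2; ++j)
        printf("rate_boost_fac=%2d rate_ratio_qdelta=%.1f -> BOOST2 ratio %.2f\n",
               rate_boost_fac[i], rate_ratio_qdelta[j],
               MIN(CR_MAX_RATE_TARGET_RATIO,
                   0.1 * rate_boost_fac[i] * rate_ratio_qdelta[j]));
    return 0;
  }
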
diff --git a/libvpx/vp9/encoder/vp9_aq_variance.c b/libvpx/vp9/encoder/vp9_aq_variance.c
index 56db95e..1c99105 100644
--- a/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -10,6 +10,9 @@
 
 #include <math.h>
 
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
 #include "vp9/encoder/vp9_aq_variance.h"
 
 #include "vp9/common/vp9_seg_common.h"
@@ -17,67 +20,32 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_systemdependent.h"
 
-#define ENERGY_MIN (-1)
+#define ENERGY_MIN (-4)
 #define ENERGY_MAX (1)
 #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
 #define ENERGY_IN_BOUNDS(energy)\
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
 
-static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+static const double rate_ratio[MAX_SEGMENTS] =
+  {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
 
-#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
-#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
+#endif
 
 unsigned int vp9_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
-
   return SEGMENT_ID(energy);
 }
 
-double vp9_vaq_rdmult_ratio(int energy) {
-  ENERGY_IN_BOUNDS(energy);
-
-  vp9_clear_system_state();
-
-  return RDMULT_RATIO(energy);
-}
-
-double vp9_vaq_inv_q_ratio(int energy) {
-  ENERGY_IN_BOUNDS(energy);
-
-  vp9_clear_system_state();
-
-  return Q_RATIO(-energy);
-}
-
-void vp9_vaq_init() {
-  int i;
-  double base_ratio;
-
-  assert(ENERGY_SPAN <= MAX_SEGMENTS);
-
-  vp9_clear_system_state();
-
-  base_ratio = 1.5;
-
-  for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-    Q_RATIO(i) = pow(base_ratio, i/3.0);
-  }
-}
-
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex);
-  const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
-                                              cm->y_dc_delta_q);
   int i;
 
   if (cm->frame_type == KEY_FRAME ||
@@ -88,29 +56,87 @@
 
     seg->abs_delta = SEGMENT_DELTADATA;
 
-  vp9_clear_system_state();
+    vpx_clear_system_state();
 
-    for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-      int qindex_delta, segment_rdmult;
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      int qindex_delta =
+          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                     rate_ratio[i], cm->bit_depth);
 
-      if (Q_RATIO(i) == 1) {
-        // No need to enable SEG_LVL_ALT_Q for this segment
-        RDMULT_RATIO(i) = 1;
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+
+      // No need to enable SEG_LVL_ALT_Q for this segment.
+      if (rate_ratio[i] == 1.0) {
         continue;
       }
 
-      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i));
-      vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
-      vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
-
-      segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
-                                           cm->y_dc_delta_q);
-
-      RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
     }
   }
 }
 
+/* TODO(agrange, paulwilkins): block_variance() calls the unoptimized versions
+ * of variance() and highbd_8_variance(). It should not.
+ */
+static void aq_variance(const uint8_t *a, int  a_stride,
+                        const uint8_t *b, int  b_stride,
+                        int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void aq_highbd_variance64(const uint8_t *a8, int  a_stride,
+                                 const uint8_t *b8, int  b_stride,
+                                 int w, int h, uint64_t *sse, uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void aq_highbd_8_variance(const uint8_t *a8, int  a_stride,
+                                 const uint8_t *b8, int  b_stride,
+                                 int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bs) {
@@ -125,24 +151,57 @@
     const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
     const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
-    variance(x->plane[0].src.buf, x->plane[0].src.stride,
-             vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                           CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+                           &sse, &avg);
+      sse >>= 2 * (xd->bd - 8);
+      avg >>= (xd->bd - 8);
+    } else {
+      aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                  vp9_64_zeros, 0, bw, bh, &sse, &avg);
+    }
+#else
+    aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
     return (256 * var) / (bw * bh);
   } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros),
+                               0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               vp9_64_zeros, 0, &sse);
+    }
+#else
     var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                              x->plane[0].src.stride,
                              vp9_64_zeros, 0, &sse);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return (256 * var) >> num_pels_log2_lookup[bs];
   }
 }
 
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int var = block_variance(cpi, x, bs);
+  vpx_clear_system_state();
+  return log(var + 1.0);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   double energy;
-  unsigned int var = block_variance(cpi, x, bs);
-
-  vp9_clear_system_state();
-
-  energy = 0.9 * (log(var + 1.0) - 10.0);
+  double energy_midpoint;
+  vpx_clear_system_state();
+  energy_midpoint =
+    (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+  energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
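To make the energy-to-segment mapping above concrete: a minimal standalone sketch (illustrative only; the block variance is hypothetical and the midpoint is the DEFAULT_E_MIDPOINT used outside two-pass mode):

  #include <math.h>
  #include <stdio.h>

  #define ENERGY_MIN (-4)
  #define ENERGY_MAX (1)

  static int clampi(int v, int lo, int hi) {
    return v < lo ? lo : (v > hi ? hi : v);
  }

  int main(void) {
    /* Same table as vp9_aq_variance.c: lower energy -> lower segment index. */
    static const int segment_id[] = {0, 1, 1, 2, 3, 4};
    const double energy_midpoint = 10.0;   /* DEFAULT_E_MIDPOINT */
    const double block_var = 2500.0;       /* hypothetical block variance */
    const int energy = clampi((int)round(log(block_var + 1.0) - energy_midpoint),
                              ENERGY_MIN, ENERGY_MAX);
    printf("energy = %d -> segment %d\n", energy, segment_id[energy - ENERGY_MIN]);
    return 0;
  }
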
diff --git a/libvpx/vp9/encoder/vp9_aq_variance.h b/libvpx/vp9/encoder/vp9_aq_variance.h
index d1a459f..a0effa3 100644
--- a/libvpx/vp9/encoder/vp9_aq_variance.h
+++ b/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -19,13 +19,10 @@
 #endif
 
 unsigned int vp9_vaq_segment_id(int energy);
-double vp9_vaq_rdmult_ratio(int energy);
-double vp9_vaq_inv_q_ratio(int energy);
-
-void vp9_vaq_init();
 void vp9_vaq_frame_setup(VP9_COMP *cpi);
 
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_avg.c b/libvpx/vp9/encoder/vp9_avg.c
new file mode 100644
index 0000000..a9a4c30
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_avg.c
@@ -0,0 +1,230 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_ports/mem.h"
+
+unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s+=p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+                        int16_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
+                                       // dynamic range [-2040, 2040]
+    coeff += 8;  // coeff: 15 bit
+                 // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+}
+
+// In place 16x16 2D Hadamard transform
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (idx = 0; idx < 64; ++idx) {
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];
+
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
+
+    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64]  = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+  int i;
+  int satd = 0;
+  for (i = 0; i < length; ++i)
+    satd += abs(coeff[i]);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return (int16_t)satd;
+}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64}.
+void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  const int norm_factor = height >> 1;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
+    for (i = 0; i < height; ++i)
+      hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 510].
+    hbuf[idx] /= norm_factor;
+    ++ref;
+  }
+}
+
+// width: value range {16, 32, 64}.
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 16320]
+  for (idx = 0; idx < width; ++idx)
+    sum += ref[idx];
+  return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+                     const int bwl) {
+  int i;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
+    mean += diff;                // mean: dynamic range 16 bits.
+    sse += diff * diff;          // sse:  dynamic range 26 bits.
+  }
+
+  // (mean * mean): dynamic range 31 bits.
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 8; ++i, s+=p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
+void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
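A minimal usage sketch for the new helpers above (illustrative only; the prototypes are repeated here and the file must be compiled and linked against vp9_avg.c, and the residual values are made up):

  #include <stdint.h>
  #include <stdio.h>

  void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
  int16_t vp9_satd_c(const int16_t *coeff, int length);

  int main(void) {
    int16_t src_diff[64];   /* 8x8 residual (source minus prediction), 9-bit range */
    int16_t coeff[64];
    int i;
    for (i = 0; i < 64; ++i)
      src_diff[i] = (int16_t)((i % 16) - 8);   /* hypothetical residual pattern */
    vp9_hadamard_8x8_c(src_diff, 8 /* stride */, coeff);
    printf("8x8 SATD = %d\n", vp9_satd_c(coeff, 64));
    return 0;
  }
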
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index b0ff0fa..d0de095 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -13,8 +13,10 @@
 #include <limits.h>
 
 #include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
+#include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -22,7 +24,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/encoder/vp9_cost.h"
@@ -32,41 +33,38 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
-static struct vp9_token intra_mode_encodings[INTRA_MODES];
-static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
-static struct vp9_token partition_encodings[PARTITION_TYPES];
-static struct vp9_token inter_mode_encodings[INTER_MODES];
+static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
+  {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
+  {62, 6}, {2, 2}};
+static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+  {{0, 1}, {2, 2}, {3, 2}};
+static const struct vp9_token partition_encodings[PARTITION_TYPES] =
+  {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+static const struct vp9_token inter_mode_encodings[INTER_MODES] =
+  {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
 
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
-  vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
-  vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
-  vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
-}
-
-static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
-static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   assert(is_inter_mode(mode));
   vp9_write_token(w, vp9_inter_mode_tree, probs,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
                                 int data, int max) {
-  vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
+  vpx_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void prob_diff_update(const vp9_tree_index *tree,
-                             vp9_prob probs[/*n - 1*/],
+static void prob_diff_update(const vpx_tree_index *tree,
+                             vpx_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/],
-                             int n, vp9_writer *w) {
+                             int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -79,57 +77,71 @@
 }
 
 static void write_selected_tx_size(const VP9_COMMON *cm,
-                                   const MACROBLOCKD *xd,
-                                   TX_SIZE tx_size, BLOCK_SIZE bsize,
-                                   vp9_writer *w) {
+                                   const MACROBLOCKD *xd, vpx_writer *w) {
+  TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
-                                                 &cm->fc.tx_probs);
-  vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
+  const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+                                                 &cm->fc->tx_probs);
+  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
+    vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
+      vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
   }
 }
 
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, vp9_writer *w) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
-    vp9_write(w, skip, vp9_get_skip_prob(cm, xd));
+    vpx_write(w, skip, vp9_get_skip_prob(cm, xd));
     return skip;
   }
 }
 
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
   int k;
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
+    vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
+                                           FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     prob_diff_update(vp9_switchable_interp_tree,
-                     cm->fc.switchable_interp_prob[j],
-                     cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
+                     cm->fc->switchable_interp_prob[j],
+                     counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer *w,
-                           TOKENEXTRA **tp, const TOKENEXTRA *const stop) {
+static void pack_mb_tokens(vpx_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
     const struct vp9_token *const a = &vp9_coef_encodings[t];
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
     int i = 0;
     int v = a->value;
     int n = a->len;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const vp9_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp9_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp9_extra_bits_high10[t];
+    else
+      b = &vp9_extra_bits[t];
+#else
+    const vp9_extra_bit *const b = &vp9_extra_bits[t];
+    (void) bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
@@ -167,12 +179,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vp9_write(w, bb, pb[i >> 1]);
+          vpx_write(w, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vp9_write_bit(w, e & 1);
+      vpx_write_bit(w, e & 1);
     }
     ++p;
   }
@@ -180,7 +192,7 @@
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
     vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
@@ -188,48 +200,49 @@
 
 // This function encodes the reference frame
 static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                             vp9_writer *w) {
+                             vpx_writer *w) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
 
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     assert(!is_compound);
     assert(mbmi->ref_frame[0] ==
-               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+               get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
   } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
+      vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
       assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
-      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
+      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+        vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
       }
     }
   }
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
-                                vp9_writer *w) {
+                                vpx_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
-  const nmv_context *nmvc = &cm->fc.nmvc;
-  const MACROBLOCK *const x = &cpi->mb;
+  const nmv_context *nmvc = &cm->fc->nmvc;
+  const MACROBLOCK *const x = &cpi->td.mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
   const int segment_id = mbmi->segment_id;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -241,8 +254,8 @@
   if (seg->update_map) {
     if (seg->temporal_update) {
       const int pred_flag = mbmi->seg_id_predicted;
-      vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-      vp9_write(w, pred_flag, pred_prob);
+      vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+      vpx_write(w, pred_flag, pred_prob);
       if (!pred_flag)
         write_segment_id(w, seg, segment_id);
     } else {
@@ -252,18 +265,17 @@
 
   skip = write_skip(cm, xd, segment_id, mi, w);
 
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+    vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(is_inter &&
-        (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+      !(is_inter && skip)) {
+    write_selected_tx_size(cm, xd, w);
   }
 
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
-      write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+      write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -271,29 +283,29 @@
       for (idy = 0; idy < 2; idy += num_4x4_h) {
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
+          write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
         }
       }
     }
-    write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+    write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
   } else {
-    const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
-    const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+    const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
-    if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(w, mode, inter_probs);
-        ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
       }
     }
 
     if (cm->interp_filter == SWITCHABLE) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       vp9_write_token(w, vp9_switchable_interp_tree,
-                      cm->fc.switchable_interp_prob[ctx],
+                      cm->fc->switchable_interp_prob[ctx],
                       &switchable_interp_encodings[mbmi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
     } else {
       assert(mbmi->interp_filter == cm->interp_filter);
     }
@@ -307,11 +319,10 @@
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
           write_inter_mode(w, b_mode, inter_probs);
-          ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
           if (b_mode == NEWMV) {
             for (ref = 0; ref < 1 + is_compound; ++ref)
               vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
-                            &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+                            &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
                             nmvc, allow_hp);
           }
         }
@@ -320,7 +331,7 @@
       if (mode == NEWMV) {
         for (ref = 0; ref < 1 + is_compound; ++ref)
           vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
-                        &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
+                        &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
                         allow_hp);
       }
     }
@@ -328,11 +339,11 @@
 }
 
 static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                              MODE_INFO **mi_8x8, vp9_writer *w) {
+                              MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
   const MODE_INFO *const mi = mi_8x8[0];
-  const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride];
-  const MODE_INFO *const left_mi = xd->left_available ? mi_8x8[-1] : NULL;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
 
@@ -342,7 +353,7 @@
   write_skip(cm, xd, mbmi->segment_id, mi, w);
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
-    write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+    write_selected_tx_size(cm, xd, w);
 
   if (bsize >= BLOCK_8X8) {
     write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
@@ -364,16 +375,19 @@
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
-                          vp9_writer *w, TOKENEXTRA **tok,
+                          vpx_writer *w, TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
                           int mi_row, int mi_col) {
   const VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
 
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
 
+  cpi->td.mb.mbmi_ext = cpi->td.mb.mbmi_ext_base +
+      (mi_row * cm->mi_cols + mi_col);
+
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
                  mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
@@ -385,15 +399,15 @@
   }
 
   assert(*tok < tok_end);
-  pack_mb_tokens(w, tok, tok_end);
+  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
 }
 
 static void write_partition(const VP9_COMMON *const cm,
                             const MACROBLOCKD *const xd,
                             int hbs, int mi_row, int mi_col,
-                            PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = get_partition_probs(cm, ctx);
+  const vpx_prob *const probs = xd->partition_probs[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
@@ -401,31 +415,33 @@
     vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
     assert(p == PARTITION_SPLIT);
   }
 }
 
 static void write_modes_sb(VP9_COMP *cpi,
-                           const TileInfo *const tile, vp9_writer *w,
+                           const TileInfo *const tile, vpx_writer *w,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  const int bsl = b_width_log2(bsize);
+  const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  const MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+  const MODE_INFO *m = NULL;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
   partition = partition_lookup[bsl][m->mbmi.sb_type];
   write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
@@ -467,13 +483,17 @@
 }
 
 static void write_modes(VP9_COMP *cpi,
-                        const TileInfo *const tile, vp9_writer *w,
+                        const TileInfo *const tile, vpx_writer *w,
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int mi_row, mi_col;
 
+  set_partition_probs(cm, xd);
+
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp9_zero(cpi->mb.e_mbd.left_seg_context);
+    vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
       write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
@@ -484,7 +504,7 @@
 static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
                                     vp9_coeff_stats *coef_branch_ct,
                                     vp9_coeff_probs_model *coef_probs) {
-  vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
+  vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
   unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
       cpi->common.counts.eob_branch[tx_size];
   int i, j, k, l, m;
@@ -508,14 +528,16 @@
   }
 }
 
-static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
+static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp9_coeff_stats *frame_branch_ct,
                                      vp9_coeff_probs_model *new_coef_probs) {
-  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
-  const vp9_prob upd = DIFF_UPDATE_PROB;
+  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+  const vpx_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
+  int stepsize = cpi->sf.coeff_prob_appx_step;
+
   switch (cpi->sf.use_fast_coef_updates) {
     case TWO_LOOP: {
       /* dry run to see if there is any update at all needed */
@@ -526,14 +548,14 @@
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                const vp9_prob oldp = old_coef_probs[i][j][k][l][t];
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd);
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -553,32 +575,32 @@
       // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vp9_write_bit(bc, 0);
+        vpx_write_bit(bc, 0);
         return;
       }
-      vp9_write_bit(bc, 1);
+      vpx_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                const vp9_prob upd = DIFF_UPDATE_PROB;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                const vpx_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd);
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
                       *oldp, &newp, upd);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -592,14 +614,7 @@
       return;
     }
 
-    case ONE_LOOP:
     case ONE_LOOP_REDUCED: {
-      const int prev_coef_contexts_to_update =
-          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
-              COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS;
-      const int coef_band_to_update =
-          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
-              COEF_BANDS >> 1 : COEF_BANDS;
       int updates = 0;
       int noupdates_before_first = 0;
       for (i = 0; i < PLANE_TYPES; ++i) {
@@ -608,25 +623,23 @@
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 int s;
                 int u = 0;
-                if (l >= prev_coef_contexts_to_update ||
-                    k >= coef_band_to_update) {
-                  u = 0;
+
+                if (t == PIVOT_NODE) {
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
                 } else {
-                  if (t == PIVOT_NODE)
-                    s = vp9_prob_diff_update_savings_search_model(
-                        frame_branch_ct[i][j][k][l][0],
-                        old_coef_probs[i][j][k][l], &newp, upd);
-                  else
-                    s = vp9_prob_diff_update_savings_search(
-                        frame_branch_ct[i][j][k][l][t],
-                        *oldp, &newp, upd);
-                  if (s > 0 && newp != *oldp)
-                    u = 1;
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
                 }
+
+                if (s > 0 && newp != *oldp)
+                  u = 1;
                 updates += u;
                 if (u == 0 && updates == 0) {
                   noupdates_before_first++;
@@ -635,11 +648,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vp9_write_bit(bc, 1);
+                  vpx_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vp9_write(bc, 0, upd);
+                    vpx_write(bc, 0, upd);
                 }
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -651,104 +664,104 @@
         }
       }
       if (updates == 0) {
-        vp9_write_bit(bc, 0);  // no updates
+        vpx_write_bit(bc, 0);  // no updates
       }
       return;
     }
-
     default:
       assert(0);
   }
 }
 
-static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
+static void update_coef_probs(VP9_COMP *cpi, vpx_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
-  vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
-  vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
-  vp9_clear_system_state();
-
-  for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
-    build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
-                            frame_coef_probs[tx_size]);
-
-  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
-                             frame_coef_probs[tx_size]);
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+    vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+    vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+    if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+        (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+      vpx_write_bit(w, 0);
+    } else {
+      build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                              frame_coef_probs);
+      update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+                               frame_coef_probs);
+    }
+  }
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
-                              struct vp9_write_bit_buffer *wb) {
+                              struct vpx_write_bit_buffer *wb) {
   int i;
 
   // Encode the loop filter level and type
-  vp9_wb_write_literal(wb, lf->filter_level, 6);
-  vp9_wb_write_literal(wb, lf->sharpness_level, 3);
+  vpx_wb_write_literal(wb, lf->filter_level, 6);
+  vpx_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
   // ref frame (if they are enabled).
-  vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
 
   if (lf->mode_ref_delta_enabled) {
-    vp9_wb_write_bit(wb, lf->mode_ref_delta_update);
+    vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
     if (lf->mode_ref_delta_update) {
       for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
         const int delta = lf->ref_deltas[i];
         const int changed = delta != lf->last_ref_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_ref_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
         const int delta = lf->mode_deltas[i];
         const int changed = delta != lf->last_mode_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_mode_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
     }
   }
 }
 
-static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) {
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
-    vp9_wb_write_bit(wb, 1);
-    vp9_wb_write_literal(wb, abs(delta_q), 4);
-    vp9_wb_write_bit(wb, delta_q < 0);
+    vpx_wb_write_bit(wb, 1);
+    vpx_wb_write_literal(wb, abs(delta_q), 4);
+    vpx_wb_write_bit(wb, delta_q < 0);
   } else {
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
   }
 }
 
 static void encode_quantization(const VP9_COMMON *const cm,
-                                struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+                                struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
   write_delta_q(wb, cm->y_dc_delta_q);
   write_delta_q(wb, cm->uv_dc_delta_q);
   write_delta_q(wb, cm->uv_ac_delta_q);
 }
 
 static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   int i, j;
 
   const struct segmentation *seg = &cm->seg;
 
-  vp9_wb_write_bit(wb, seg->enabled);
+  vpx_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled)
     return;
 
   // Segmentation map
-  vp9_wb_write_bit(wb, seg->update_map);
+  vpx_wb_write_bit(wb, seg->update_map);
   if (seg->update_map) {
     // Select the coding strategy (temporal or spatial)
     vp9_choose_segmap_coding_method(cm, xd);
@@ -756,40 +769,40 @@
     for (i = 0; i < SEG_TREE_PROBS; i++) {
       const int prob = seg->tree_probs[i];
       const int update = prob != MAX_PROB;
-      vp9_wb_write_bit(wb, update);
+      vpx_wb_write_bit(wb, update);
       if (update)
-        vp9_wb_write_literal(wb, prob, 8);
+        vpx_wb_write_literal(wb, prob, 8);
     }
 
     // Write out the chosen coding method.
-    vp9_wb_write_bit(wb, seg->temporal_update);
+    vpx_wb_write_bit(wb, seg->temporal_update);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++) {
         const int prob = seg->pred_probs[i];
         const int update = prob != MAX_PROB;
-        vp9_wb_write_bit(wb, update);
+        vpx_wb_write_bit(wb, update);
         if (update)
-          vp9_wb_write_literal(wb, prob, 8);
+          vpx_wb_write_literal(wb, prob, 8);
       }
     }
   }
 
   // Segmentation data
-  vp9_wb_write_bit(wb, seg->update_data);
+  vpx_wb_write_bit(wb, seg->update_data);
   if (seg->update_data) {
-    vp9_wb_write_bit(wb, seg->abs_delta);
+    vpx_wb_write_bit(wb, seg->abs_delta);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
-        const int active = vp9_segfeature_active(seg, i, j);
-        vp9_wb_write_bit(wb, active);
+        const int active = segfeature_active(seg, i, j);
+        vpx_wb_write_bit(wb, active);
         if (active) {
-          const int data = vp9_get_segdata(seg, i, j);
+          const int data = get_segdata(seg, i, j);
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
             encode_unsigned_max(wb, abs(data), data_max);
-            vp9_wb_write_bit(wb, data < 0);
+            vpx_wb_write_bit(wb, data < 0);
           } else {
             encode_unsigned_max(wb, data, data_max);
           }
@@ -799,11 +812,12 @@
   }
 }
 
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
   // Mode
-  vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
+  vpx_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
-    vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
+    vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
 
   // Probabilities
   if (cm->tx_mode == TX_MODE_SELECT) {
@@ -814,37 +828,37 @@
 
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
+      tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
+      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
                                   ct_16x16p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
+      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
                                   ct_32x32p[j]);
     }
   }
 }
 
 static void write_interp_filter(INTERP_FILTER filter,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   const int filter_to_literal[] = { 1, 0, 2, 3 };
 
-  vp9_wb_write_bit(wb, filter == SWITCHABLE);
+  vpx_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
-    vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
+    vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
 }
 
-static void fix_interp_filter(VP9_COMMON *cm) {
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
   if (cm->interp_filter == SWITCHABLE) {
     // Check to see if only one of the filters is actually used
     int count[SWITCHABLE_FILTERS];
@@ -852,7 +866,7 @@
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
       count[i] = 0;
       for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
-        count[i] += cm->counts.switchable_interp[j][i];
+        count[i] += counts->switchable_interp[j][i];
       c += (count[i] > 0);
     }
     if (c == 1) {
@@ -868,22 +882,22 @@
 }
 
 static void write_tile_info(const VP9_COMMON *const cm,
-                            struct vp9_write_bit_buffer *wb) {
+                            struct vpx_write_bit_buffer *wb) {
   int min_log2_tile_cols, max_log2_tile_cols, ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   ones = cm->log2_tile_cols - min_log2_tile_cols;
   while (ones--)
-    vp9_wb_write_bit(wb, 1);
+    vpx_wb_write_bit(wb, 1);
 
   if (cm->log2_tile_cols < max_log2_tile_cols)
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
 
   // rows
-  vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
   if (cm->log2_tile_rows != 0)
-    vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
+    vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
 
 static int get_refresh_mask(VP9_COMP *cpi) {
@@ -914,43 +928,33 @@
 
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_writer residual_bc;
-
+  vpx_writer residual_bc;
   int tile_row, tile_col;
-  TOKENEXTRA *tok[4][1 << 6], *tok_end;
+  TOKENEXTRA *tok_end;
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
 
-  vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
-             mi_cols_aligned_to_sb(cm->mi_cols));
-
-  tok[0][0] = cpi->tok;
-  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-    if (tile_row)
-      tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] +
-                         cpi->tok_count[tile_row - 1][tile_cols - 1];
-
-    for (tile_col = 1; tile_col < tile_cols; tile_col++)
-      tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
-                                cpi->tok_count[tile_row][tile_col - 1];
-  }
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      TileInfo tile;
+      int tile_idx = tile_row * tile_cols + tile_col;
+      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
-      tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
+      tok_end = cpi->tile_tok[tile_row][tile_col] +
+          cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
-        vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
       else
-        vp9_start_encode(&residual_bc, data_ptr + total_size);
+        vpx_start_encode(&residual_bc, data_ptr + total_size);
 
-      write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
-      assert(tok[tile_row][tile_col] == tok_end);
-      vp9_stop_encode(&residual_bc);
+      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+                  &residual_bc, &tok, tok_end);
+      assert(tok == tok_end);
+      vpx_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
         mem_put_be32(data_ptr + total_size, residual_bc.pos);
@@ -965,75 +969,82 @@
 }
 
 static void write_display_size(const VP9_COMMON *cm,
-                               struct vp9_write_bit_buffer *wb) {
+                               struct vpx_write_bit_buffer *wb) {
   const int scaling_active = cm->width != cm->display_width ||
                              cm->height != cm->display_height;
-  vp9_wb_write_bit(wb, scaling_active);
+  vpx_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
-    vp9_wb_write_literal(wb, cm->display_width - 1, 16);
-    vp9_wb_write_literal(wb, cm->display_height - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_width - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_height - 1, 16);
   }
 }
 
 static void write_frame_size(const VP9_COMMON *cm,
-                             struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->width - 1, 16);
-  vp9_wb_write_literal(wb, cm->height - 1, 16);
+                             struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->width - 1, 16);
+  vpx_wb_write_literal(wb, cm->height - 1, 16);
 
   write_display_size(cm, wb);
 }
 
 static void write_frame_size_with_refs(VP9_COMP *cpi,
-                                       struct vp9_write_bit_buffer *wb) {
+                                       struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
   int found = 0;
 
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
-    found = cm->width == cfg->y_crop_width &&
-            cm->height == cfg->y_crop_height;
 
     // Set "found" to 0 for temporal svc and for spatial svc key frame
     if (cpi->use_svc &&
-        (cpi->svc.number_spatial_layers == 1 ||
-         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
+        ((cpi->svc.number_temporal_layers > 1 &&
+         cpi->oxcf.rc_mode == VPX_CBR) ||
+        (cpi->svc.number_spatial_layers > 1 &&
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
+        (is_two_pass_svc(cpi) &&
+         cpi->svc.encode_empty_frame_state == ENCODING &&
+         cpi->svc.layer_context[0].frames_from_key_frame <
+         cpi->svc.number_temporal_layers + 1))) {
       found = 0;
+    } else if (cfg != NULL) {
+      found = cm->width == cfg->y_crop_width &&
+              cm->height == cfg->y_crop_height;
     }
-    vp9_wb_write_bit(wb, found);
+    vpx_wb_write_bit(wb, found);
     if (found) {
       break;
     }
   }
 
   if (!found) {
-    vp9_wb_write_literal(wb, cm->width - 1, 16);
-    vp9_wb_write_literal(wb, cm->height - 1, 16);
+    vpx_wb_write_literal(wb, cm->width - 1, 16);
+    vpx_wb_write_literal(wb, cm->height - 1, 16);
   }
 
   write_display_size(cm, wb);
 }
 
-static void write_sync_code(struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
 }
 
 static void write_profile(BITSTREAM_PROFILE profile,
-                          struct vp9_write_bit_buffer *wb) {
+                          struct vpx_write_bit_buffer *wb) {
   switch (profile) {
     case PROFILE_0:
-      vp9_wb_write_literal(wb, 0, 2);
+      vpx_wb_write_literal(wb, 0, 2);
       break;
     case PROFILE_1:
-      vp9_wb_write_literal(wb, 2, 2);
+      vpx_wb_write_literal(wb, 2, 2);
       break;
     case PROFILE_2:
-      vp9_wb_write_literal(wb, 1, 2);
+      vpx_wb_write_literal(wb, 1, 2);
       break;
     case PROFILE_3:
-      vp9_wb_write_literal(wb, 6, 3);
+      vpx_wb_write_literal(wb, 6, 3);
       break;
     default:
       assert(0);
@@ -1041,51 +1052,60 @@
 }
 
 static void write_bitdepth_colorspace_sampling(
-    VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
+    VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
   if (cm->profile >= PROFILE_2) {
-    assert(cm->bit_depth > BITS_8);
-    vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+    assert(cm->bit_depth > VPX_BITS_8);
+    vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
-  vp9_wb_write_literal(wb, cm->color_space, 3);
-  if (cm->color_space != SRGB) {
-    vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+  vpx_wb_write_literal(wb, cm->color_space, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
+    vpx_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
-      vp9_wb_write_bit(wb, cm->subsampling_x);
-      vp9_wb_write_bit(wb, cm->subsampling_y);
-      vp9_wb_write_bit(wb, 0);  // unused
+      vpx_wb_write_bit(wb, cm->subsampling_x);
+      vpx_wb_write_bit(wb, cm->subsampling_y);
+      vpx_wb_write_bit(wb, 0);  // unused
     } else {
       assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
     }
   } else {
     assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
-    vp9_wb_write_bit(wb, 0);  // unused
+    vpx_wb_write_bit(wb, 0);  // unused
   }
 }
 
 static void write_uncompressed_header(VP9_COMP *cpi,
-                                      struct vp9_write_bit_buffer *wb) {
+                                      struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+  vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
 
   write_profile(cm->profile, wb);
 
-  vp9_wb_write_bit(wb, 0);  // show_existing_frame
-  vp9_wb_write_bit(wb, cm->frame_type);
-  vp9_wb_write_bit(wb, cm->show_frame);
-  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  vpx_wb_write_bit(wb, cm->frame_type);
+  vpx_wb_write_bit(wb, cm->show_frame);
+  vpx_wb_write_bit(wb, cm->error_resilient_mode);
 
   if (cm->frame_type == KEY_FRAME) {
     write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
+    // In spatial svc, if it's not error_resilient_mode then we need to code
+    // all visible frames as invisible. But we need to keep the show_frame
+    // flag so that the publisher can tell whether a frame is supposed to be
+    // visible. So we code the show_frame flag as-is and then code the
+    // intra_only bit here. This makes the bitstream incompatible. In the
+    // player we will change the show_frame flag to 0, then add a one-byte
+    // frame with the show_existing_frame flag, which tells the decoder which
+    // frame to show.
     if (!cm->show_frame)
-      vp9_wb_write_bit(wb, cm->intra_only);
+      vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
-      vp9_wb_write_literal(wb, cm->reset_frame_context, 2);
+      vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
 
     if (cm->intra_only) {
       write_sync_code(wb);
@@ -1095,112 +1115,113 @@
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-        vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame),
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                              REF_FRAMES_LOG2);
-        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
       }
 
       write_frame_size_with_refs(cpi, wb);
 
-      vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
+      vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
 
-      fix_interp_filter(cm);
+      fix_interp_filter(cm, cpi->td.counts);
       write_interp_filter(cm->interp_filter, wb);
     }
   }
 
   if (!cm->error_resilient_mode) {
-    vp9_wb_write_bit(wb, cm->refresh_frame_context);
-    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
+    vpx_wb_write_bit(wb, cm->refresh_frame_context);
+    vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+  vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
-  encode_segmentation(cm, &cpi->mb.e_mbd, wb);
+  encode_segmentation(cm, xd, wb);
 
   write_tile_info(cm, wb);
 }
 
 static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  FRAME_CONTEXT *const fc = &cm->fc;
-  vp9_writer header_bc;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = cpi->td.counts;
+  vpx_writer header_bc;
 
-  vp9_start_encode(&header_bc, data);
+  vpx_start_encode(&header_bc, data);
 
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
   else
-    encode_txfm_probs(cm, &header_bc);
+    encode_txfm_probs(cm, &header_bc, counts);
 
   update_coef_probs(cpi, &header_bc);
-  update_skip_probs(cm, &header_bc);
+  update_skip_probs(cm, &header_bc, counts);
 
   if (!frame_is_intra_only(cm)) {
     int i;
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
-                       cm->counts.inter_mode[i], INTER_MODES, &header_bc);
-
-    vp9_zero(cm->counts.inter_mode);
+      prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
+                       counts->inter_mode[i], INTER_MODES, &header_bc);
 
     if (cm->interp_filter == SWITCHABLE)
-      update_switchable_interp_probs(cm, &header_bc);
+      update_switchable_interp_probs(cm, &header_bc, counts);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
-                                cm->counts.intra_inter[i]);
+                                counts->intra_inter[i]);
 
-    if (cm->allow_comp_inter_inter) {
+    if (cpi->allow_comp_inter_inter) {
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
-      vp9_write_bit(&header_bc, use_compound_pred);
+      vpx_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
-        vp9_write_bit(&header_bc, use_hybrid_pred);
+        vpx_write_bit(&header_bc, use_hybrid_pred);
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      cm->counts.comp_inter[i]);
+                                      counts->comp_inter[i]);
       }
     }
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  cm->counts.single_ref[i][0]);
+                                  counts->single_ref[i][0]);
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  cm->counts.single_ref[i][1]);
+                                  counts->single_ref[i][1]);
       }
     }
 
     if (cm->reference_mode != SINGLE_REFERENCE)
       for (i = 0; i < REF_CONTEXTS; i++)
         vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  cm->counts.comp_ref[i]);
+                                  counts->comp_ref[i]);
 
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-      prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
-                       cm->counts.y_mode[i], INTRA_MODES, &header_bc);
+      prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
+                       counts->y_mode[i], INTRA_MODES, &header_bc);
 
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
       prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
-                       cm->counts.partition[i], PARTITION_TYPES, &header_bc);
+                       counts->partition[i], PARTITION_TYPES, &header_bc);
 
-    vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
+    vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+                        &counts->mv);
   }
 
-  vp9_stop_encode(&header_bc);
+  vpx_stop_encode(&header_bc);
   assert(header_bc.pos <= 0xffff);
 
   return header_bc.pos;
@@ -1209,22 +1230,22 @@
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
   size_t first_part_size, uncompressed_hdr_size;
-  struct vp9_write_bit_buffer wb = {data, 0};
-  struct vp9_write_bit_buffer saved_wb;
+  struct vpx_write_bit_buffer wb = {data, 0};
+  struct vpx_write_bit_buffer saved_wb;
 
   write_uncompressed_header(cpi, &wb);
   saved_wb = wb;
-  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+  vpx_wb_write_literal(&wb, 0, 16);  // don't know the first part size in advance
 
-  uncompressed_hdr_size = vp9_wb_bytes_written(&wb);
+  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
   // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
-  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
 
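Note: the packing flow above is a reserve-then-backfill pattern. write_uncompressed_header() emits the header bits, a 16-bit placeholder is reserved through saved_wb, the compressed header is written, and only then is first_part_size patched into the placeholder; encode_tiles() similarly prefixes every tile except the last with a 4-byte big-endian size via mem_put_be32(). A minimal standalone sketch of the same pattern (plain C, not libvpx APIs; put_be16, the marker bytes and the payload are made up for illustration):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  static void put_be16(uint8_t *mem, uint16_t val) {
    mem[0] = (uint8_t)(val >> 8);     /* high byte first, as in mem_put_be32 */
    mem[1] = (uint8_t)(val & 0xff);
  }

  int main(void) {
    uint8_t buf[64];
    size_t pos = 0;

    buf[pos++] = 0x82;                /* stand-in "uncompressed header" bytes */
    buf[pos++] = 0x49;

    const size_t size_field = pos;    /* remember where the 16-bit size goes */
    pos += 2;                         /* reserve it; value not yet known */

    const uint8_t payload[] = { 1, 2, 3, 4, 5 };  /* stand-in "compressed header" */
    memcpy(buf + pos, payload, sizeof(payload));
    pos += sizeof(payload);

    put_be16(buf + size_field, (uint16_t)sizeof(payload));  /* backfill the size */

    printf("wrote %zu bytes, size field = %u\n", pos,
           (unsigned)((buf[size_field] << 8) | buf[size_field + 1]));
    return 0;
  }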
diff --git a/libvpx/vp9/encoder/vp9_bitstream.h b/libvpx/vp9/encoder/vp9_bitstream.h
index 8e82d1c..da6b414 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/libvpx/vp9/encoder/vp9_bitstream.h
@@ -18,18 +18,16 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 
-void vp9_entropy_mode_init();
-
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
   return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
          cpi->rc.is_src_frame_alt_ref &&
          (!cpi->use_svc ||      // Add spatial svc base layer case here
-          (is_spatial_svc(cpi) &&
+          (is_two_pass_svc(cpi) &&
            cpi->svc.spatial_layer_id == 0 &&
            cpi->svc.layer_context[0].gold_ref_idx >=0 &&
-           cpi->oxcf.ss_play_alternate[0]));
+           cpi->oxcf.ss_enable_auto_arf[0]));
 }
 
 #ifdef __cplusplus
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index bd3b0fd..fc34786 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -13,8 +13,6 @@
 
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,8 +26,8 @@
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
-  int16_t *qcoeff;
-  int16_t *coeff;
+  tran_low_t *qcoeff;
+  tran_low_t *coeff;
   uint16_t *eobs;
   struct buf_2d src;
 
@@ -42,8 +40,6 @@
   int16_t *round;
 
   int64_t quant_thred[2];
-  // Zbin Over Quant value
-  int16_t zbin_extra;
 };
 
 /* The [2] dimension is for whether we skip the EOB node (i.e. if previous
@@ -51,11 +47,18 @@
 typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
                                    [COEFF_CONTEXTS][ENTROPY_TOKENS];
 
+typedef struct {
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  uint8_t mode_context[MAX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
 
   MACROBLOCKD e_mbd;
+  MB_MODE_INFO_EXT *mbmi_ext;
+  MB_MODE_INFO_EXT *mbmi_ext_base;
   int skip_block;
   int select_tx_size;
   int skip_recode;
@@ -69,6 +72,11 @@
   int rdmult;
   int mb_energy;
 
+  // These are set to their default values at the beginning, and then adjusted
+  // further in the encoding process.
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
+
   int mv_best_ref_index[MAX_REF_FRAMES];
   unsigned int max_mv_context[MAX_REF_FRAMES];
   unsigned int source_variance;
@@ -76,16 +84,12 @@
   int pred_mv_sad[MAX_REF_FRAMES];
 
   int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
   int *nmvcost[2];
-  int nmvcosts_hp[2][MV_VALS];
   int *nmvcost_hp[2];
   int **mvcost;
 
   int nmvjointsadcost[MV_JOINTS];
-  int nmvsadcosts[2][MV_VALS];
   int *nmvsadcost[2];
-  int nmvsadcosts_hp[2][MV_VALS];
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
@@ -96,7 +100,10 @@
   int mv_row_min;
   int mv_row_max;
 
+  // Notes transform blocks where no coefficients are coded.
+  // Set during mode selection. Read during block encoding.
   uint8_t zcoeff_blk[TX_SIZES][256];
+
   int skip;
 
   int encode_breakout;
@@ -104,8 +111,6 @@
   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
 
-  int in_static_area;
-
   int optimize;
 
   // indicate if it is in the rd search loop or encoding process
@@ -116,15 +121,26 @@
   int quant_fp;
 
   // skip forward transform and quantization
-  int skip_txfm[MAX_MB_PLANE];
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  #define SKIP_TXFM_NONE 0
+  #define SKIP_TXFM_AC_DC 1
+  #define SKIP_TXFM_AC_ONLY 2
 
-  int64_t bsse[MAX_MB_PLANE];
+  int64_t bsse[MAX_MB_PLANE << 2];
 
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
-  void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
-  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
+  // Strong color activity detection. Used in RTC coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
+
+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+#endif
 };
 
 #ifdef __cplusplus
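The int16_t -> tran_low_t switch for coeff/qcoeff here (and tran_high_t in the DCT changes later in this patch) is what the high-bit-depth build relies on: the coefficient type widens when CONFIG_VP9_HIGHBITDEPTH is enabled so 10/12-bit intermediates don't overflow 16 bits. The shape of the typedefs, paraphrased from the common headers (the include and the fallback define are added here only so the sketch compiles on its own):

  #include <stdint.h>
  #include <stdio.h>

  #ifndef CONFIG_VP9_HIGHBITDEPTH
  #define CONFIG_VP9_HIGHBITDEPTH 0  /* normally set by configure; 0 only for this sketch */
  #endif

  #if CONFIG_VP9_HIGHBITDEPTH
  typedef int64_t tran_high_t;  /* intermediate precision inside the transforms */
  typedef int32_t tran_low_t;   /* storage precision for coefficients */
  #else
  typedef int32_t tran_high_t;
  typedef int16_t tran_low_t;
  #endif

  int main(void) {
    printf("tran_low_t: %zu bytes, tran_high_t: %zu bytes\n",
           sizeof(tran_low_t), sizeof(tran_high_t));
    return 0;
  }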
diff --git a/libvpx/vp9/encoder/vp9_blockiness.c b/libvpx/vp9/encoder/vp9_blockiness.c
new file mode 100644
index 0000000..1a89ce4
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_blockiness.c
@@ -0,0 +1,134 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {
+  return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+  return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
+  return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+//              p0 p1 p2 p3
+//              q0 q1 q2 q3
+// block edge ->
+//              r0 r1 r2 r3
+//              s0 s1 s2 s3
+
+// blockiness =  p0*-2+q0*6+r0*-6+s0*2 +
+//               p1*-2+q1*6+r1*-6+s1*2 +
+//               p2*-2+q2*6+r2*-6+s2*2 +
+//               p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(blockiness from reconstructed buffer -
+//                                blockiness from source buffer, 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) / 4 - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) / 4 - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+                               int rp, int size) {
+  int s_blockiness = 0;
+  int r_blockiness = 0;
+  int sum_0 = 0;
+  int sum_sq_0 = 0;
+  int sum_1 = 0;
+  int sum_sq_1 = 0;
+  int i;
+  int var_0;
+  int var_1;
+  for (i = 0; i < size; ++i, s += sp, r += rp) {
+    s_blockiness += horizontal_filter(s);
+    r_blockiness += horizontal_filter(r);
+    sum_0 += s[0];
+    sum_sq_0 += s[0]*s[0];
+    sum_1 += s[-1];
+    sum_sq_1 += s[-1]*s[-1];
+  }
+  var_0 = variance(sum_0, sum_sq_0, size);
+  var_1 = variance(sum_1, sum_sq_1, size);
+  r_blockiness = abs(r_blockiness);
+  s_blockiness = abs(s_blockiness);
+
+  if (r_blockiness > s_blockiness)
+    return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+  else
+    return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+                                 int rp, int size) {
+  int s_blockiness = 0;
+  int r_blockiness = 0;
+  int sum_0 = 0;
+  int sum_sq_0 = 0;
+  int sum_1 = 0;
+  int sum_sq_1 = 0;
+  int i;
+  int var_0;
+  int var_1;
+  for (i = 0; i < size; ++i, ++s, ++r) {
+    s_blockiness += vertical_filter(s, sp);
+    r_blockiness += vertical_filter(r, rp);
+    sum_0 += s[0];
+    sum_sq_0 += s[0] * s[0];
+    sum_1 += s[-sp];
+    sum_sq_1 += s[-sp] * s[-sp];
+  }
+  var_0 = variance(sum_0, sum_sq_0, size);
+  var_1 = variance(sum_1, sum_sq_1, size);
+  r_blockiness = abs(r_blockiness);
+  s_blockiness = abs(s_blockiness);
+
+  if (r_blockiness > s_blockiness)
+    return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+  else
+    return 0;
+}
+
+// This function returns the blockiness for the entire frame currently by
+// looking at all borders in steps of 4.
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch,
+                          int width, int height) {
+  double blockiness = 0;
+  int i, j;
+  vpx_clear_system_state();
+  for (i = 0; i < height; i += 4, img1 += img1_pitch * 4,
+       img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4) {
+      if (i > 0 && i < height && j > 0 && j < width) {
+        blockiness += blockiness_vertical(img1 + j, img1_pitch,
+                                          img2 + j, img2_pitch, 4);
+        blockiness += blockiness_horizontal(img1 + j, img1_pitch,
+                                            img2 + j, img2_pitch, 4);
+      }
+    }
+  }
+  blockiness /= width * height / 16;
+  return blockiness;
+}
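As a sanity check on the arithmetic above, here is a tiny standalone program (made-up pixel values, plain C, not part of the patch) that evaluates one 4-row vertical edge with the same filter taps and variance scaling as blockiness_vertical(): a flat source against a reconstruction with a 10-level step at the edge yields a positive score, while identical buffers would yield 0.

  #include <stdio.h>
  #include <stdlib.h>
  #include <stdint.h>

  /* Same taps as horizontal_filter() above; s points at the first pixel to
   * the right of the vertical block edge. */
  static int edge_filter(const uint8_t *s) {
    return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
  }

  /* Same as variance(sum, sum_sq, 4) above. */
  static int variance4(int sum, int sum_sq) {
    return sum_sq / 4 - (sum / 4) * (sum / 4);
  }

  int main(void) {
    /* 4 rows of 4 pixels; the block edge runs between columns 1 and 2. */
    static const uint8_t src[4][4] = {  /* smooth source */
      {100, 100, 100, 100}, {100, 100, 100, 100},
      {100, 100, 100, 100}, {100, 100, 100, 100}};
    static const uint8_t rec[4][4] = {  /* reconstruction with a step at the edge */
      {100, 100,  90,  90}, {100, 100,  90,  90},
      {100, 100,  90,  90}, {100, 100,  90,  90}};

    int s_blk = 0, r_blk = 0, sum0 = 0, sq0 = 0, sum1 = 0, sq1 = 0;
    for (int i = 0; i < 4; ++i) {
      s_blk += edge_filter(&src[i][2]);
      r_blk += edge_filter(&rec[i][2]);
      sum0 += src[i][2]; sq0 += src[i][2] * src[i][2];  /* source, right of edge */
      sum1 += src[i][1]; sq1 += src[i][1] * src[i][1];  /* source, left of edge */
    }
    const int var0 = variance4(sum0, sq0);
    const int var1 = variance4(sum1, sq1);
    const int score = abs(r_blk) > abs(s_blk)
                          ? (abs(r_blk) - abs(s_blk)) / (1 + var0 + var1)
                          : 0;
    printf("edge blockiness = %d\n", score);  /* 160 here; 0 if rec == src */
    return 0;
  }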
diff --git a/libvpx/vp9/encoder/vp9_context_tree.c b/libvpx/vp9/encoder/vp9_context_tree.c
index 9b7a932..e87cccb 100644
--- a/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/libvpx/vp9/encoder/vp9_context_tree.c
@@ -26,17 +26,17 @@
   ctx->num_4x4_blk = num_blk;
 
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                  vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_pix * sizeof(uint16_t)));
+                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
@@ -69,10 +69,13 @@
   alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]);
   alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]);
 
-  /* TODO(Jbb): for 4x8 and 8x4 these allocated values are not used.
-   * Figure out a better way to do this. */
-  alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]);
-  alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[1]);
+  if (num_4x4_blk > 4) {
+    alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]);
+    alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[1]);
+  } else {
+    memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+    memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+  }
 }
 
 static void free_tree_contexts(PC_TREE *tree) {
@@ -87,7 +90,7 @@
 // partition level. There are contexts for none, horizontal, vertical, and
 // split.  Along with a block_size value and a selected block_size which
 // represents the state of our search.
-void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   int i, j;
   const int leaf_nodes = 64;
   const int tree_nodes = 64 + 16 + 4 + 1;
@@ -97,24 +100,24 @@
   int square_index = 1;
   int nodes;
 
-  vpx_free(cpi->leaf_tree);
-  CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
-                                                 sizeof(*cpi->leaf_tree)));
-  vpx_free(cpi->pc_tree);
-  CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
-                                               sizeof(*cpi->pc_tree)));
+  vpx_free(td->leaf_tree);
+  CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+                                                sizeof(*td->leaf_tree)));
+  vpx_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->pc_tree)));
 
-  this_pc = &cpi->pc_tree[0];
-  this_leaf = &cpi->leaf_tree[0];
+  this_pc = &td->pc_tree[0];
+  this_leaf = &td->leaf_tree[0];
 
   // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
   // context so we only need to allocate 1 for each 8x8 block.
   for (i = 0; i < leaf_nodes; ++i)
-    alloc_mode_context(cm, 1, &cpi->leaf_tree[i]);
+    alloc_mode_context(cm, 1, &td->leaf_tree[i]);
 
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-    PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
     alloc_tree_contexts(cm, tree, 4);
     tree->leaf_split[0] = this_leaf++;
@@ -126,7 +129,7 @@
   // from leafs to the root.
   for (nodes = 16; nodes > 0; nodes >>= 2) {
     for (i = 0; i < nodes; ++i) {
-      PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
       alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
       tree->block_size = square[square_index];
       for (j = 0; j < 4; j++)
@@ -135,24 +138,24 @@
     }
     ++square_index;
   }
-  cpi->pc_root = &cpi->pc_tree[tree_nodes - 1];
-  cpi->pc_root[0].none.best_mode_index = 2;
+  td->pc_root = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[0].none.best_mode_index = 2;
 }
 
-void vp9_free_pc_tree(VP9_COMP *cpi) {
+void vp9_free_pc_tree(ThreadData *td) {
   const int tree_nodes = 64 + 16 + 4 + 1;
   int i;
 
   // Set up all 4x4 mode contexts
   for (i = 0; i < 64; ++i)
-    free_mode_context(&cpi->leaf_tree[i]);
+    free_mode_context(&td->leaf_tree[i]);
 
   // Sets up all the leaf nodes in the tree.
   for (i = 0; i < tree_nodes; ++i)
-    free_tree_contexts(&cpi->pc_tree[i]);
+    free_tree_contexts(&td->pc_tree[i]);
 
-  vpx_free(cpi->pc_tree);
-  cpi->pc_tree = NULL;
-  vpx_free(cpi->leaf_tree);
-  cpi->leaf_tree = NULL;
+  vpx_free(td->pc_tree);
+  td->pc_tree = NULL;
+  vpx_free(td->leaf_tree);
+  td->leaf_tree = NULL;
 }
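For reference, the tree_nodes constant used above (64 + 16 + 4 + 1) corresponds to one PC_TREE node per square block position inside a 64x64 superblock, from 8x8 up to the single 64x64 root, while the 64 leaf_tree entries hold the shared sub-8x8 contexts, one per 8x8 block, as noted in the comment above. A throwaway sketch of that arithmetic:

  #include <stdio.h>

  int main(void) {
    int total = 0;
    for (int size = 8; size <= 64; size *= 2) {
      const int per_sb = (64 / size) * (64 / size);  /* 64, 16, 4, 1 */
      printf("%2dx%-2d squares per 64x64 superblock: %d\n", size, size, per_sb);
      total += per_sb;
    }
    printf("PC_TREE nodes per superblock: %d\n", total);  /* 64 + 16 + 4 + 1 = 85 */
    return 0;
  }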
diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h
index d60e6c3..ac24497 100644
--- a/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/libvpx/vp9/encoder/vp9_context_tree.h
@@ -11,36 +11,48 @@
 #ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
 #define VP9_ENCODER_VP9_CONTEXT_TREE_H_
 
-#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
 
 struct VP9_COMP;
+struct VP9Common;
+struct ThreadData;
 
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
+  MB_MODE_INFO_EXT mbmi_ext;
   uint8_t *zcoeff_blk;
-  int16_t *coeff[MAX_MB_PLANE][3];
-  int16_t *qcoeff[MAX_MB_PLANE][3];
-  int16_t *dqcoeff[MAX_MB_PLANE][3];
+  tran_low_t *coeff[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
   uint16_t *eobs[MAX_MB_PLANE][3];
 
   // dual buffer pointers, 0: in use, 1: best in store
-  int16_t *coeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
   uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
 
   int is_coded;
   int num_4x4_blk;
   int skip;
-  int skip_txfm[MAX_MB_PLANE];
+  int pred_pixel_ready;
+  // For current partition, only if all Y, U, and V transform blocks'
+  // coefficients are quantized to 0, skippable is set to 0.
+  int skippable;
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
   int best_mode_index;
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
-  int64_t tx_rd_diff[TX_MODES];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+  int rate;
+  int64_t dist;
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   unsigned int newmv_sse;
   unsigned int zeromv_sse;
@@ -69,7 +81,7 @@
   };
 } PC_TREE;
 
-void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi);
-void vp9_free_pc_tree(struct VP9_COMP *cpi);
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
 
 #endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/libvpx/vp9/encoder/vp9_cost.c b/libvpx/vp9/encoder/vp9_cost.c
index 1c3c3d2..e2fbb34 100644
--- a/libvpx/vp9/encoder/vp9_cost.c
+++ b/libvpx/vp9/encoder/vp9_cost.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <assert.h>
 
 #include "vp9/encoder/vp9_cost.h"
 
@@ -34,14 +35,14 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
   4,    3,    1,    1};
 
-static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
                  int i, int c) {
-  const vp9_prob prob = probs[i / 2];
+  const vpx_prob prob = probs[i / 2];
   int b;
 
   for (b = 0; b <= 1; ++b) {
     const int cc = c + vp9_cost_bit(prob, b);
-    const vp9_tree_index ii = tree[i + b];
+    const vpx_tree_index ii = tree[i + b];
 
     if (ii <= 0)
       costs[-ii] = cc;
@@ -50,11 +51,11 @@
   }
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
   cost(costs, tree, probs, 0, 0);
 }
 
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
   assert(tree[0] <= 0 && tree[1] > 0);
 
   costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
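vp9_cost_tokens() above fills a per-token bit-cost table by recursing over the probability tree. A standalone sketch of that recursion follows; it uses the same tree/probs layout as the cost() helper above, but cost_bit() here is a rough -log2 stand-in for the precomputed vp9_prob_cost table, and the 3-token tree and probabilities are made up for the example.

  #include <math.h>
  #include <stdio.h>

  typedef signed char tree_index;  /* mirrors the vpx_tree_index layout */

  /* Placeholder bit-cost: roughly -log2 of an 8-bit probability, in 1/256-bit
   * units. The encoder reads these values from the precomputed vp9_prob_cost
   * table instead of calling log2(). */
  static int cost_bit(int prob256, int bit) {
    const double p = (bit ? 256 - prob256 : prob256) / 256.0;
    return (int)(-256.0 * log2(p) + 0.5);
  }

  /* Same recursion as cost() above: entries <= 0 are leaves storing -token,
   * positive entries index the next pair of children; probs[i / 2] is the
   * probability of taking branch 0 at node i. */
  static void fill_costs(int *costs, const tree_index *tree, const int *probs,
                         int i, int c) {
    for (int b = 0; b <= 1; ++b) {
      const int cc = c + cost_bit(probs[i / 2], b);
      const tree_index ii = tree[i + b];
      if (ii <= 0)
        costs[-ii] = cc;
      else
        fill_costs(costs, tree, probs, ii, cc);
    }
  }

  int main(void) {
    /* Made-up 3-token tree: branch 0 -> token 0, branch 1 -> {token 1, token 2}. */
    static const tree_index tree[4] = { 0, 2, -1, -2 };
    static const int probs[2] = { 192, 128 };  /* made-up 8-bit probabilities */
    int costs[3];
    fill_costs(costs, tree, probs, 0, 0);
    for (int t = 0; t < 3; ++t)
      printf("token %d costs %d (1/256-bit units)\n", t, costs[t]);
    return 0;
  }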
diff --git a/libvpx/vp9/encoder/vp9_cost.h b/libvpx/vp9/encoder/vp9_cost.h
index 6d2b940..eac74c4 100644
--- a/libvpx/vp9/encoder/vp9_cost.h
+++ b/libvpx/vp9/encoder/vp9_cost.h
@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_COST_H_
 #define VP9_ENCODER_VP9_COST_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -21,20 +21,20 @@
 
 #define vp9_cost_zero(prob) (vp9_prob_cost[prob])
 
-#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
+#define vp9_cost_one(prob) vp9_cost_zero(vpx_complement(prob))
 
-#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vpx_complement(prob) \
                                                     : (prob))
 
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vp9_prob p) {
+                                          vpx_prob p) {
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
-static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs,
                              int bits, int len) {
   int cost = 0;
-  vp9_tree_index i = 0;
+  vpx_tree_index i = 0;
 
   do {
     const int bit = (bits >> --len) & 1;
@@ -45,8 +45,8 @@
   return cost;
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index 59222f0..f94540b 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -11,22 +11,18 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_ports/mem.h"
 
-static INLINE int fdct_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(INT16_MIN <= rv && rv <= INT16_MAX);
-  return rv;
-}
-
-static void fdct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t step[4];
+  tran_high_t temp1, temp2;
 
   step[0] = input[0] + input[3];
   step[1] = input[1] + input[2];
@@ -35,93 +31,210 @@
 
   temp1 = (step[0] + step[1]) * cospi_16_64;
   temp2 = (step[0] - step[1]) * cospi_16_64;
-  output[0] = fdct_round_shift(temp1);
-  output[2] = fdct_round_shift(temp2);
+  output[0] = (tran_low_t)fdct_round_shift(temp1);
+  output[2] = (tran_low_t)fdct_round_shift(temp2);
   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-  output[1] = fdct_round_shift(temp1);
-  output[3] = fdct_round_shift(temp2);
+  output[1] = (tran_low_t)fdct_round_shift(temp1);
+  output[3] = (tran_low_t)fdct_round_shift(temp2);
 }
 
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
-  int r, c;
-  int16_t sum = 0;
-  for (r = 0; r < 4; ++r)
-    for (c = 0; c < 4; ++c)
-      sum += input[r * stride + c];
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
 
-  output[0] = sum << 1;
-  output[1] = 0;
+  // stage 1
+  s0 = input[0] + input[7];
+  s1 = input[1] + input[6];
+  s2 = input[2] + input[5];
+  s3 = input[3] + input[4];
+  s4 = input[3] - input[4];
+  s5 = input[2] - input[5];
+  s6 = input[1] - input[6];
+  s7 = input[0] - input[7];
+
+  // fdct4(step, step);
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[0] = (tran_low_t)fdct_round_shift(t0);
+  output[2] = (tran_low_t)fdct_round_shift(t2);
+  output[4] = (tran_low_t)fdct_round_shift(t1);
+  output[6] = (tran_low_t)fdct_round_shift(t3);
+
+  // Stage 2
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = (tran_low_t)fdct_round_shift(t0);
+  t3 = (tran_low_t)fdct_round_shift(t1);
+
+  // Stage 3
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
+
+  // Stage 4
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[1] = (tran_low_t)fdct_round_shift(t0);
+  output[3] = (tran_low_t)fdct_round_shift(t2);
+  output[5] = (tran_low_t)fdct_round_shift(t1);
+  output[7] = (tran_low_t)fdct_round_shift(t3);
 }
 
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  int16_t intermediate[4 * 4];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int input[4];
-    /*canbe16*/ int step[4];
-    /*needs32*/ int temp1, temp2;
-    int i;
-    for (i = 0; i < 4; ++i) {
-      // Load inputs.
-      if (0 == pass) {
-        input[0] = in[0 * stride] * 16;
-        input[1] = in[1 * stride] * 16;
-        input[2] = in[2 * stride] * 16;
-        input[3] = in[3 * stride] * 16;
-        if (i == 0 && input[0]) {
-          input[0] += 1;
-        }
-      } else {
-        input[0] = in[0 * 4];
-        input[1] = in[1 * 4];
-        input[2] = in[2 * 4];
-        input[3] = in[3 * 4];
-      }
-      // Transform.
-      step[0] = input[0] + input[3];
-      step[1] = input[1] + input[2];
-      step[2] = input[1] - input[2];
-      step[3] = input[0] - input[3];
-      temp1 = (step[0] + step[1]) * cospi_16_64;
-      temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = fdct_round_shift(temp1);
-      out[2] = fdct_round_shift(temp2);
-      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = fdct_round_shift(temp1);
-      out[3] = fdct_round_shift(temp2);
-      // Do next column (which is a transposed row in second/horizontal pass)
-      in++;
-      out += 4;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t input[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
 
+  // step 1
+  input[0] = in[0] + in[15];
+  input[1] = in[1] + in[14];
+  input[2] = in[2] + in[13];
+  input[3] = in[3] + in[12];
+  input[4] = in[4] + in[11];
+  input[5] = in[5] + in[10];
+  input[6] = in[6] + in[ 9];
+  input[7] = in[7] + in[ 8];
+
+  step1[0] = in[7] - in[ 8];
+  step1[1] = in[6] - in[ 9];
+  step1[2] = in[5] - in[10];
+  step1[3] = in[4] - in[11];
+  step1[4] = in[3] - in[12];
+  step1[5] = in[2] - in[13];
+  step1[6] = in[1] - in[14];
+  step1[7] = in[0] - in[15];
+
+  // fdct8(step, step);
   {
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
-    }
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // stage 1
+    s0 = input[0] + input[7];
+    s1 = input[1] + input[6];
+    s2 = input[2] + input[5];
+    s3 = input[3] + input[4];
+    s4 = input[3] - input[4];
+    s5 = input[2] - input[5];
+    s6 = input[1] - input[6];
+    s7 = input[0] - input[7];
+
+    // fdct4(step, step);
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+    out[0] = (tran_low_t)fdct_round_shift(t0);
+    out[4] = (tran_low_t)fdct_round_shift(t2);
+    out[8] = (tran_low_t)fdct_round_shift(t1);
+    out[12] = (tran_low_t)fdct_round_shift(t3);
+
+    // Stage 2
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4
+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+    out[2] = (tran_low_t)fdct_round_shift(t0);
+    out[6] = (tran_low_t)fdct_round_shift(t2);
+    out[10] = (tran_low_t)fdct_round_shift(t1);
+    out[14] = (tran_low_t)fdct_round_shift(t3);
   }
+
+  // step 2
+  temp1 = (step1[5] - step1[2]) * cospi_16_64;
+  temp2 = (step1[4] - step1[3]) * cospi_16_64;
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
+  temp1 = (step1[4] + step1[3]) * cospi_16_64;
+  temp2 = (step1[5] + step1[2]) * cospi_16_64;
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
+
+  // step 3
+  step3[0] = step1[0] + step2[3];
+  step3[1] = step1[1] + step2[2];
+  step3[2] = step1[1] - step2[2];
+  step3[3] = step1[0] - step2[3];
+  step3[4] = step1[7] - step2[4];
+  step3[5] = step1[6] - step2[5];
+  step3[6] = step1[6] + step2[5];
+  step3[7] = step1[7] + step2[4];
+
+  // step 4
+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
+  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
+
+  // step 5
+  step1[0] = step3[0] + step2[1];
+  step1[1] = step3[0] - step2[1];
+  step1[2] = step3[3] + step2[2];
+  step1[3] = step3[3] - step2[2];
+  step1[4] = step3[4] - step2[5];
+  step1[5] = step3[4] + step2[5];
+  step1[6] = step3[7] - step2[6];
+  step1[7] = step3[7] + step2[6];
+
+  // step 6
+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  out[1] = (tran_low_t)fdct_round_shift(temp1);
+  out[9] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+  out[5] = (tran_low_t)fdct_round_shift(temp1);
+  out[13] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  out[3] = (tran_low_t)fdct_round_shift(temp1);
+  out[11] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+  out[7] = (tran_low_t)fdct_round_shift(temp1);
+  out[15] = (tran_low_t)fdct_round_shift(temp2);
 }
 
-static void fadst4(const int16_t *input, int16_t *output) {
-  int x0, x1, x2, x3;
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   x0 = input[0];
   x1 = input[1];
@@ -153,386 +266,23 @@
   s3 = x2 - x0 + x3;
 
   // 1-D transform scaling factor is sqrt(2).
-  output[0] = fdct_round_shift(s0);
-  output[1] = fdct_round_shift(s1);
-  output[2] = fdct_round_shift(s2);
-  output[3] = fdct_round_shift(s3);
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
 }
 
-static const transform_2d FHT_4[] = {
-  { fdct4,  fdct4  },  // DCT_DCT  = 0
-  { fadst4, fdct4  },  // ADST_DCT = 1
-  { fdct4,  fadst4 },  // DCT_ADST = 2
-  { fadst4, fadst4 }   // ADST_ADST = 3
-};
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
-void vp9_fht4x4_c(const int16_t *input, int16_t *output,
-                  int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_fdct4x4_c(input, output, stride);
-  } else {
-    int16_t out[4 * 4];
-    int16_t *outptr = &out[0];
-    int i, j;
-    int16_t temp_in[4], temp_out[4];
-    const transform_2d ht = FHT_4[tx_type];
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = input[j * stride + i] * 16;
-      if (i == 0 && temp_in[0])
-        temp_in[0] += 1;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        outptr[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j + i * 4];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
-    }
-  }
-}
-
-static void fdct8(const int16_t *input, int16_t *output) {
-  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-  /*needs32*/ int t0, t1, t2, t3;
-  /*canbe16*/ int x0, x1, x2, x3;
-
-  // stage 1
-  s0 = input[0] + input[7];
-  s1 = input[1] + input[6];
-  s2 = input[2] + input[5];
-  s3 = input[3] + input[4];
-  s4 = input[3] - input[4];
-  s5 = input[2] - input[5];
-  s6 = input[1] - input[6];
-  s7 = input[0] - input[7];
-
-  // fdct4(step, step);
-  x0 = s0 + s3;
-  x1 = s1 + s2;
-  x2 = s1 - s2;
-  x3 = s0 - s3;
-  t0 = (x0 + x1) * cospi_16_64;
-  t1 = (x0 - x1) * cospi_16_64;
-  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-  output[0] = fdct_round_shift(t0);
-  output[2] = fdct_round_shift(t2);
-  output[4] = fdct_round_shift(t1);
-  output[6] = fdct_round_shift(t3);
-
-  // Stage 2
-  t0 = (s6 - s5) * cospi_16_64;
-  t1 = (s6 + s5) * cospi_16_64;
-  t2 = fdct_round_shift(t0);
-  t3 = fdct_round_shift(t1);
-
-  // Stage 3
-  x0 = s4 + t2;
-  x1 = s4 - t2;
-  x2 = s7 - t3;
-  x3 = s7 + t3;
-
-  // Stage 4
-  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-  output[1] = fdct_round_shift(t0);
-  output[3] = fdct_round_shift(t2);
-  output[5] = fdct_round_shift(t1);
-  output[7] = fdct_round_shift(t3);
-}
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
-  int r, c;
-  int16_t sum = 0;
-  for (r = 0; r < 8; ++r)
-    for (c = 0; c < 8; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum;
-  output[1] = 0;
-}
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
-  int i, j;
-  int16_t intermediate[64];
-
-  // Transform columns
-  {
-    int16_t *output = intermediate;
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
-
-    int i;
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-      output[0 * 8] = fdct_round_shift(t0);
-      output[2 * 8] = fdct_round_shift(t2);
-      output[4 * 8] = fdct_round_shift(t1);
-      output[6 * 8] = fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-      output[1 * 8] = fdct_round_shift(t0);
-      output[3 * 8] = fdct_round_shift(t2);
-      output[5 * 8] = fdct_round_shift(t1);
-      output[7 * 8] = fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    fdct8(&intermediate[i * 8], &final_output[i * 8]);
-    for (j = 0; j < 8; ++j)
-      final_output[j + i * 8] /= 2;
-  }
-}
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
-  int r, c;
-  int16_t sum = 0;
-  for (r = 0; r < 16; ++r)
-    for (c = 0; c < 16; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum >> 1;
-  output[1] = 0;
-}
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  int16_t intermediate[256];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int step1[8];
-    /*canbe16*/ int step2[8];
-    /*canbe16*/ int step3[8];
-    /*canbe16*/ int input[8];
-    /*needs32*/ int temp1, temp2;
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
-        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
-        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
-        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
-        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
-        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
-        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
-        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
-        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
-        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
-        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
-        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
-        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
-        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
-        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
-        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
-        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
-        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
-        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
-        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
-        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
-        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
-        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
-        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
-        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
-        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
-        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
-        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
-        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-        /*needs32*/ int t0, t1, t2, t3;
-        /*canbe16*/ int x0, x1, x2, x3;
-
-        // stage 1
-        s0 = input[0] + input[7];
-        s1 = input[1] + input[6];
-        s2 = input[2] + input[5];
-        s3 = input[3] + input[4];
-        s4 = input[3] - input[4];
-        s5 = input[2] - input[5];
-        s6 = input[1] - input[6];
-        s7 = input[0] - input[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = fdct_round_shift(t0);
-        out[4] = fdct_round_shift(t2);
-        out[8] = fdct_round_shift(t1);
-        out[12] = fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-        out[2] = fdct_round_shift(t0);
-        out[6] = fdct_round_shift(t2);
-        out[10] = fdct_round_shift(t1);
-        out[14] = fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = fdct_round_shift(temp1);
-        out[9] = fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = fdct_round_shift(temp1);
-        out[13] = fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = fdct_round_shift(temp1);
-        out[11] = fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = fdct_round_shift(temp1);
-        out[15] = fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      in++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-}
-
-static void fadst8(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
 
   // stage 1
   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
@@ -583,270 +333,36 @@
   x6 = fdct_round_shift(s6);
   x7 = fdct_round_shift(s7);
 
-  output[0] =   x0;
-  output[1] = - x4;
-  output[2] =   x6;
-  output[3] = - x2;
-  output[4] =   x3;
-  output[5] = - x7;
-  output[6] =   x5;
-  output[7] = - x1;
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x4;
+  output[2] = (tran_low_t)x6;
+  output[3] = (tran_low_t)-x2;
+  output[4] = (tran_low_t)x3;
+  output[5] = (tran_low_t)-x7;
+  output[6] = (tran_low_t)x5;
+  output[7] = (tran_low_t)-x1;
 }
 
-static const transform_2d FHT_8[] = {
-  { fdct8,  fdct8  },  // DCT_DCT  = 0
-  { fadst8, fdct8  },  // ADST_DCT = 1
-  { fdct8,  fadst8 },  // DCT_ADST = 2
-  { fadst8, fadst8 }   // ADST_ADST = 3
-};
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
-void vp9_fht8x8_c(const int16_t *input, int16_t *output,
-                  int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_fdct8x8_c(input, output, stride);
-  } else {
-    int16_t out[64];
-    int16_t *outptr = &out[0];
-    int i, j;
-    int16_t temp_in[8], temp_out[8];
-    const transform_2d ht = FHT_8[tx_type];
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        outptr[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j + i * 8];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-    }
-  }
-}
-
-/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
-   pixel. */
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
-  int i;
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * stride];
-    b1 = ip[1 * stride];
-    c1 = ip[2 * stride];
-    d1 = ip[3 * stride];
-
-    a1 += b1;
-    d1 = d1 - c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = a1;
-    op[4] = c1;
-    op[8] = d1;
-    op[12] = b1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0];
-    b1 = ip[1];
-    c1 = ip[2];
-    d1 = ip[3];
-
-    a1 += b1;
-    d1 -= c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = a1 * UNIT_QUANT_FACTOR;
-    op[1] = c1 * UNIT_QUANT_FACTOR;
-    op[2] = d1 * UNIT_QUANT_FACTOR;
-    op[3] = b1 * UNIT_QUANT_FACTOR;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-// Rewrote to use same algorithm as others.
-static void fdct16(const int16_t in[16], int16_t out[16]) {
-  /*canbe16*/ int step1[8];
-  /*canbe16*/ int step2[8];
-  /*canbe16*/ int step3[8];
-  /*canbe16*/ int input[8];
-  /*needs32*/ int temp1, temp2;
-
-  // step 1
-  input[0] = in[0] + in[15];
-  input[1] = in[1] + in[14];
-  input[2] = in[2] + in[13];
-  input[3] = in[3] + in[12];
-  input[4] = in[4] + in[11];
-  input[5] = in[5] + in[10];
-  input[6] = in[6] + in[ 9];
-  input[7] = in[7] + in[ 8];
-
-  step1[0] = in[7] - in[ 8];
-  step1[1] = in[6] - in[ 9];
-  step1[2] = in[5] - in[10];
-  step1[3] = in[4] - in[11];
-  step1[4] = in[3] - in[12];
-  step1[5] = in[2] - in[13];
-  step1[6] = in[1] - in[14];
-  step1[7] = in[0] - in[15];
-
-  // fdct8(step, step);
-  {
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
-
-    // stage 1
-    s0 = input[0] + input[7];
-    s1 = input[1] + input[6];
-    s2 = input[2] + input[5];
-    s3 = input[3] + input[4];
-    s4 = input[3] - input[4];
-    s5 = input[2] - input[5];
-    s6 = input[1] - input[6];
-    s7 = input[0] - input[7];
-
-    // fdct4(step, step);
-    x0 = s0 + s3;
-    x1 = s1 + s2;
-    x2 = s1 - s2;
-    x3 = s0 - s3;
-    t0 = (x0 + x1) * cospi_16_64;
-    t1 = (x0 - x1) * cospi_16_64;
-    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
-    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-    out[0] = fdct_round_shift(t0);
-    out[4] = fdct_round_shift(t2);
-    out[8] = fdct_round_shift(t1);
-    out[12] = fdct_round_shift(t3);
-
-    // Stage 2
-    t0 = (s6 - s5) * cospi_16_64;
-    t1 = (s6 + s5) * cospi_16_64;
-    t2 = fdct_round_shift(t0);
-    t3 = fdct_round_shift(t1);
-
-    // Stage 3
-    x0 = s4 + t2;
-    x1 = s4 - t2;
-    x2 = s7 - t3;
-    x3 = s7 + t3;
-
-    // Stage 4
-    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-    out[2] = fdct_round_shift(t0);
-    out[6] = fdct_round_shift(t2);
-    out[10] = fdct_round_shift(t1);
-    out[14] = fdct_round_shift(t3);
-  }
-
-  // step 2
-  temp1 = (step1[5] - step1[2]) * cospi_16_64;
-  temp2 = (step1[4] - step1[3]) * cospi_16_64;
-  step2[2] = fdct_round_shift(temp1);
-  step2[3] = fdct_round_shift(temp2);
-  temp1 = (step1[4] + step1[3]) * cospi_16_64;
-  temp2 = (step1[5] + step1[2]) * cospi_16_64;
-  step2[4] = fdct_round_shift(temp1);
-  step2[5] = fdct_round_shift(temp2);
-
-  // step 3
-  step3[0] = step1[0] + step2[3];
-  step3[1] = step1[1] + step2[2];
-  step3[2] = step1[1] - step2[2];
-  step3[3] = step1[0] - step2[3];
-  step3[4] = step1[7] - step2[4];
-  step3[5] = step1[6] - step2[5];
-  step3[6] = step1[6] + step2[5];
-  step3[7] = step1[7] + step2[4];
-
-  // step 4
-  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
-  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
-  step2[1] = fdct_round_shift(temp1);
-  step2[2] = fdct_round_shift(temp2);
-  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-  step2[5] = fdct_round_shift(temp1);
-  step2[6] = fdct_round_shift(temp2);
-
-  // step 5
-  step1[0] = step3[0] + step2[1];
-  step1[1] = step3[0] - step2[1];
-  step1[2] = step3[3] + step2[2];
-  step1[3] = step3[3] - step2[2];
-  step1[4] = step3[4] - step2[5];
-  step1[5] = step3[4] + step2[5];
-  step1[6] = step3[7] - step2[6];
-  step1[7] = step3[7] + step2[6];
-
-  // step 6
-  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
-  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-  out[1] = fdct_round_shift(temp1);
-  out[9] = fdct_round_shift(temp2);
-
-  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-  out[5] = fdct_round_shift(temp1);
-  out[13] = fdct_round_shift(temp2);
-
-  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
-  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-  out[3] = fdct_round_shift(temp1);
-  out[11] = fdct_round_shift(temp2);
-
-  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-  out[7] = fdct_round_shift(temp1);
-  out[15] = fdct_round_shift(temp2);
-}
-
-static void fadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
 
   // stage 1
   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
@@ -972,24 +488,38 @@
   x14 = fdct_round_shift(s14);
   x15 = fdct_round_shift(s15);
 
-  output[0] = x0;
-  output[1] = - x8;
-  output[2] = x12;
-  output[3] = - x4;
-  output[4] = x6;
-  output[5] = x14;
-  output[6] = x10;
-  output[7] = x2;
-  output[8] = x3;
-  output[9] =  x11;
-  output[10] = x15;
-  output[11] = x7;
-  output[12] = x5;
-  output[13] = - x13;
-  output[14] = x9;
-  output[15] = - x1;
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x8;
+  output[2] = (tran_low_t)x12;
+  output[3] = (tran_low_t)-x4;
+  output[4] = (tran_low_t)x6;
+  output[5] = (tran_low_t)x14;
+  output[6] = (tran_low_t)x10;
+  output[7] = (tran_low_t)x2;
+  output[8] = (tran_low_t)x3;
+  output[9] = (tran_low_t)x11;
+  output[10] = (tran_low_t)x15;
+  output[11] = (tran_low_t)x7;
+  output[12] = (tran_low_t)x5;
+  output[13] = (tran_low_t)-x13;
+  output[14] = (tran_low_t)x9;
+  output[15] = (tran_low_t)-x1;
 }
 
+static const transform_2d FHT_4[] = {
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
+};
+
 static const transform_2d FHT_16[] = {
   { fdct16,  fdct16  },  // DCT_DCT  = 0
   { fadst16, fdct16  },  // ADST_DCT = 1
@@ -997,15 +527,244 @@
   { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
-void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct4x4_c(input, output, stride);
+  } else {
+    tran_low_t out[4 * 4];
+    int i, j;
+    tran_low_t temp_in[4], temp_out[4];
+    const transform_2d ht = FHT_4[tx_type];
+
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = input[j * stride + i] * 16;
+      if (i == 0 && temp_in[0])
+        temp_in[0] += 1;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        out[j * 4 + i] = temp_out[j];
+    }
+
+    // Rows
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j + i * 4];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    }
+  }
+}
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
+                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
+  int eob = -1;
+
+  int i, j;
+  tran_low_t intermediate[64];
+
+  // Transform columns
+  {
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      s0 = (input[0 * stride] + input[7 * stride]) * 4;
+      s1 = (input[1 * stride] + input[6 * stride]) * 4;
+      s2 = (input[2 * stride] + input[5 * stride]) * 4;
+      s3 = (input[3 * stride] + input[4 * stride]) * 4;
+      s4 = (input[3 * stride] - input[4 * stride]) * 4;
+      s5 = (input[2 * stride] - input[5 * stride]) * 4;
+      s6 = (input[1 * stride] - input[6 * stride]) * 4;
+      s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+      input++;
+      output++;
+    }
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    for (j = 0; j < 8; ++j)
+      coeff_ptr[j + i * 8] /= 2;
+  }
+
+  // TODO(jingning) Decide whether these arguments are still needed once the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
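Editor's note, not part of the applied patch: the quantization loop above takes the absolute value of the coefficient, adds the rounding term, scales by a Q16 quantizer and re-applies the sign. The self-contained sketch below walks through that arithmetic with made-up values (coeff = -93, round = 8, quant = 9362 and dequant = 7 are hypothetical, not taken from the patch or the VP9 quantizer tables):

    #include <stdio.h>

    int main(void) {
      const int coeff = -93;
      const int coeff_sign = coeff >> 31;                       /* -1 here */
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  /* 93 */
      int tmp = abs_coeff + 8;                                  /* add round */
      tmp = (tmp * 9362) >> 16;                                 /* ~101/7 = 14 */
      printf("qcoeff=%d dqcoeff=%d\n",
             (tmp ^ coeff_sign) - coeff_sign,                   /* -14 */
             ((tmp ^ coeff_sign) - coeff_sign) * 7);            /* -98 */
      return 0;
    }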
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_c(input, output, stride);
+  } else {
+    tran_low_t out[64];
+    int i, j;
+    tran_low_t temp_in[8], temp_out[8];
+    const transform_2d ht = FHT_8[tx_type];
+
+    // Columns
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        out[j * 8 + i] = temp_out[j];
+    }
+
+    // Rows
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j + i * 8];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    }
+  }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  int i;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
+
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)a1;
+    op[4] = (tran_low_t)c1;
+    op[8] = (tran_low_t)d1;
+    op[12] = (tran_low_t)b1;
+
+    ip_pass0++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+    ip += 4;
+    op += 4;
+  }
+}
+
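Editor's note, not part of the applied patch: as a quick check of the Walsh-Hadamard butterfly above, the standalone sketch below transforms a flat 4x4 block and shows all of the energy landing in the DC position. UNIT_QUANT_FACTOR is assumed to be 4 (its definition lives in vp9_quant_common.h, outside this patch), and wht4_1d is a local copy of the butterfly, not a libvpx function:

    #include <stdio.h>

    /* Output order matches the a1, c1, d1, b1 store order used above. */
    static void wht4_1d(const int *in, int *out) {
      int a1 = in[0], b1 = in[1], c1 = in[2], d1 = in[3], e1;
      a1 += b1;
      d1 -= c1;
      e1 = (a1 - d1) >> 1;
      b1 = e1 - b1;
      c1 = e1 - c1;
      a1 -= c1;
      d1 += b1;
      out[0] = a1; out[1] = c1; out[2] = d1; out[3] = b1;
    }

    int main(void) {
      int block[16], tmp[16], col[4], res[4], i, j;
      for (i = 0; i < 16; i++) block[i] = 1;      /* flat 4x4 block */
      for (i = 0; i < 4; i++) {                   /* column pass */
        for (j = 0; j < 4; j++) col[j] = block[j * 4 + i];
        wht4_1d(col, res);
        for (j = 0; j < 4; j++) tmp[j * 4 + i] = res[j];
      }
      for (i = 0; i < 4; i++)                     /* row pass */
        wht4_1d(&tmp[i * 4], &block[i * 4]);
      /* Second pass results are scaled by UNIT_QUANT_FACTOR, assumed 4. */
      printf("DC=%d\n", block[0] * 4);            /* prints DC=16, rest 0 */
      return 0;
    }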
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
-    vp9_fdct16x16_c(input, output, stride);
+    vpx_fdct16x16_c(input, output, stride);
   } else {
-    int16_t out[256];
-    int16_t *outptr = &out[0];
+    tran_low_t out[256];
     int i, j;
-    int16_t temp_in[16], temp_out[16];
+    tran_low_t temp_in[16], temp_out[16];
     const transform_2d ht = FHT_16[tx_type];
 
     // Columns
@@ -1014,7 +773,7 @@
         temp_in[j] = input[j * stride + i] * 4;
       ht.cols(temp_in, temp_out);
       for (j = 0; j < 16; ++j)
-        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+        out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
     }
 
     // Rows
@@ -1028,403 +787,24 @@
   }
 }
 
-static INLINE int dct_32_round(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(-131072 <= rv && rv <= 131071);
-  return rv;
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp9_fht4x4_c(input, output, stride, tx_type);
 }
 
-static INLINE int half_round_shift(int input) {
-  int rv = (input + 1 + (input < 0)) >> 2;
-  return rv;
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp9_fht8x8_c(input, output, stride, tx_type);
 }
 
-static void fdct32(const int *input, int *output, int round) {
-  int step[32];
-  // Stage 1
-  step[0] = input[0] + input[(32 - 1)];
-  step[1] = input[1] + input[(32 - 2)];
-  step[2] = input[2] + input[(32 - 3)];
-  step[3] = input[3] + input[(32 - 4)];
-  step[4] = input[4] + input[(32 - 5)];
-  step[5] = input[5] + input[(32 - 6)];
-  step[6] = input[6] + input[(32 - 7)];
-  step[7] = input[7] + input[(32 - 8)];
-  step[8] = input[8] + input[(32 - 9)];
-  step[9] = input[9] + input[(32 - 10)];
-  step[10] = input[10] + input[(32 - 11)];
-  step[11] = input[11] + input[(32 - 12)];
-  step[12] = input[12] + input[(32 - 13)];
-  step[13] = input[13] + input[(32 - 14)];
-  step[14] = input[14] + input[(32 - 15)];
-  step[15] = input[15] + input[(32 - 16)];
-  step[16] = -input[16] + input[(32 - 17)];
-  step[17] = -input[17] + input[(32 - 18)];
-  step[18] = -input[18] + input[(32 - 19)];
-  step[19] = -input[19] + input[(32 - 20)];
-  step[20] = -input[20] + input[(32 - 21)];
-  step[21] = -input[21] + input[(32 - 22)];
-  step[22] = -input[22] + input[(32 - 23)];
-  step[23] = -input[23] + input[(32 - 24)];
-  step[24] = -input[24] + input[(32 - 25)];
-  step[25] = -input[25] + input[(32 - 26)];
-  step[26] = -input[26] + input[(32 - 27)];
-  step[27] = -input[27] + input[(32 - 28)];
-  step[28] = -input[28] + input[(32 - 29)];
-  step[29] = -input[29] + input[(32 - 30)];
-  step[30] = -input[30] + input[(32 - 31)];
-  step[31] = -input[31] + input[(32 - 32)];
-
-  // Stage 2
-  output[0] = step[0] + step[16 - 1];
-  output[1] = step[1] + step[16 - 2];
-  output[2] = step[2] + step[16 - 3];
-  output[3] = step[3] + step[16 - 4];
-  output[4] = step[4] + step[16 - 5];
-  output[5] = step[5] + step[16 - 6];
-  output[6] = step[6] + step[16 - 7];
-  output[7] = step[7] + step[16 - 8];
-  output[8] = -step[8] + step[16 - 9];
-  output[9] = -step[9] + step[16 - 10];
-  output[10] = -step[10] + step[16 - 11];
-  output[11] = -step[11] + step[16 - 12];
-  output[12] = -step[12] + step[16 - 13];
-  output[13] = -step[13] + step[16 - 14];
-  output[14] = -step[14] + step[16 - 15];
-  output[15] = -step[15] + step[16 - 16];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = step[18];
-  output[19] = step[19];
-
-  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
-  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
-  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
-  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
-  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
-  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
-  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
-  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
-  output[28] = step[28];
-  output[29] = step[29];
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // dump the magnitude by 4, hence the intermediate values are within
-  // the range of 16 bits.
-  if (round) {
-    output[0] = half_round_shift(output[0]);
-    output[1] = half_round_shift(output[1]);
-    output[2] = half_round_shift(output[2]);
-    output[3] = half_round_shift(output[3]);
-    output[4] = half_round_shift(output[4]);
-    output[5] = half_round_shift(output[5]);
-    output[6] = half_round_shift(output[6]);
-    output[7] = half_round_shift(output[7]);
-    output[8] = half_round_shift(output[8]);
-    output[9] = half_round_shift(output[9]);
-    output[10] = half_round_shift(output[10]);
-    output[11] = half_round_shift(output[11]);
-    output[12] = half_round_shift(output[12]);
-    output[13] = half_round_shift(output[13]);
-    output[14] = half_round_shift(output[14]);
-    output[15] = half_round_shift(output[15]);
-
-    output[16] = half_round_shift(output[16]);
-    output[17] = half_round_shift(output[17]);
-    output[18] = half_round_shift(output[18]);
-    output[19] = half_round_shift(output[19]);
-    output[20] = half_round_shift(output[20]);
-    output[21] = half_round_shift(output[21]);
-    output[22] = half_round_shift(output[22]);
-    output[23] = half_round_shift(output[23]);
-    output[24] = half_round_shift(output[24]);
-    output[25] = half_round_shift(output[25]);
-    output[26] = half_round_shift(output[26]);
-    output[27] = half_round_shift(output[27]);
-    output[28] = half_round_shift(output[28]);
-    output[29] = half_round_shift(output[29]);
-    output[30] = half_round_shift(output[30]);
-    output[31] = half_round_shift(output[31]);
-  }
-
-  // Stage 3
-  step[0] = output[0] + output[(8 - 1)];
-  step[1] = output[1] + output[(8 - 2)];
-  step[2] = output[2] + output[(8 - 3)];
-  step[3] = output[3] + output[(8 - 4)];
-  step[4] = -output[4] + output[(8 - 5)];
-  step[5] = -output[5] + output[(8 - 6)];
-  step[6] = -output[6] + output[(8 - 7)];
-  step[7] = -output[7] + output[(8 - 8)];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
-  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
-  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
-  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
-  step[14] = output[14];
-  step[15] = output[15];
-
-  step[16] = output[16] + output[23];
-  step[17] = output[17] + output[22];
-  step[18] = output[18] + output[21];
-  step[19] = output[19] + output[20];
-  step[20] = -output[20] + output[19];
-  step[21] = -output[21] + output[18];
-  step[22] = -output[22] + output[17];
-  step[23] = -output[23] + output[16];
-  step[24] = -output[24] + output[31];
-  step[25] = -output[25] + output[30];
-  step[26] = -output[26] + output[29];
-  step[27] = -output[27] + output[28];
-  step[28] = output[28] + output[27];
-  step[29] = output[29] + output[26];
-  step[30] = output[30] + output[25];
-  step[31] = output[31] + output[24];
-
-  // Stage 4
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = -step[2] + step[1];
-  output[3] = -step[3] + step[0];
-  output[4] = step[4];
-  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
-  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
-  output[7] = step[7];
-  output[8] = step[8] + step[11];
-  output[9] = step[9] + step[10];
-  output[10] = -step[10] + step[9];
-  output[11] = -step[11] + step[8];
-  output[12] = -step[12] + step[15];
-  output[13] = -step[13] + step[14];
-  output[14] = step[14] + step[13];
-  output[15] = step[15] + step[12];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
-  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
-  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
-  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
-  output[22] = step[22];
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = step[25];
-  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
-  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
-  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
-  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // Stage 5
-  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
-  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
-  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
-  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
-  step[4] = output[4] + output[5];
-  step[5] = -output[5] + output[4];
-  step[6] = -output[6] + output[7];
-  step[7] = output[7] + output[6];
-  step[8] = output[8];
-  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
-  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
-  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
-  step[15] = output[15];
-
-  step[16] = output[16] + output[19];
-  step[17] = output[17] + output[18];
-  step[18] = -output[18] + output[17];
-  step[19] = -output[19] + output[16];
-  step[20] = -output[20] + output[23];
-  step[21] = -output[21] + output[22];
-  step[22] = output[22] + output[21];
-  step[23] = output[23] + output[20];
-  step[24] = output[24] + output[27];
-  step[25] = output[25] + output[26];
-  step[26] = -output[26] + output[25];
-  step[27] = -output[27] + output[24];
-  step[28] = -output[28] + output[31];
-  step[29] = -output[29] + output[30];
-  step[30] = output[30] + output[29];
-  step[31] = output[31] + output[28];
-
-  // Stage 6
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
-  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
-  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
-  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
-  output[8] = step[8] + step[9];
-  output[9] = -step[9] + step[8];
-  output[10] = -step[10] + step[11];
-  output[11] = step[11] + step[10];
-  output[12] = step[12] + step[13];
-  output[13] = -step[13] + step[12];
-  output[14] = -step[14] + step[15];
-  output[15] = step[15] + step[14];
-
-  output[16] = step[16];
-  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
-  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
-  output[19] = step[19];
-  output[20] = step[20];
-  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
-  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
-  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
-  output[27] = step[27];
-  output[28] = step[28];
-  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
-  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
-  output[31] = step[31];
-
-  // Stage 7
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
-  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
-  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
-  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
-  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
-  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
-  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
-  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
-  step[16] = output[16] + output[17];
-  step[17] = -output[17] + output[16];
-  step[18] = -output[18] + output[19];
-  step[19] = output[19] + output[18];
-  step[20] = output[20] + output[21];
-  step[21] = -output[21] + output[20];
-  step[22] = -output[22] + output[23];
-  step[23] = output[23] + output[22];
-  step[24] = output[24] + output[25];
-  step[25] = -output[25] + output[24];
-  step[26] = -output[26] + output[27];
-  step[27] = output[27] + output[26];
-  step[28] = output[28] + output[29];
-  step[29] = -output[29] + output[28];
-  step[30] = -output[30] + output[31];
-  step[31] = output[31] + output[30];
-
-  // Final stage --- outputs indices are bit-reversed.
-  output[0]  = step[0];
-  output[16] = step[1];
-  output[8]  = step[2];
-  output[24] = step[3];
-  output[4]  = step[4];
-  output[20] = step[5];
-  output[12] = step[6];
-  output[28] = step[7];
-  output[2]  = step[8];
-  output[18] = step[9];
-  output[10] = step[10];
-  output[26] = step[11];
-  output[6]  = step[12];
-  output[22] = step[13];
-  output[14] = step[14];
-  output[30] = step[15];
-
-  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
-  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
-  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
-  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
-  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
-  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
-  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
-  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
-  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
-  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
-  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
-  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
-  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
-  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
-  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
-  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp9_fwht4x4_c(input, output, stride);
 }
 
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
-  int r, c;
-  int16_t sum = 0;
-  for (r = 0; r < 32; ++r)
-    for (c = 0; c < 32; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum >> 3;
-  output[1] = 0;
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
+                           int stride, int tx_type) {
+  vp9_fht16x16_c(input, output, stride, tx_type);
 }
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
-  int i, j;
-  int output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = output[j + i * 32];
-    fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-  }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
-  int i, j;
-  int output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      // TODO(cd): see quality impact of only doing
-      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
-      //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = output[j + i * 32];
-    fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] = temp_out[j];
-  }
-}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c
index 90ea9cc..5f99285 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/libvpx/vp9/encoder/vp9_denoiser.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 #include <limits.h>
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -31,9 +32,6 @@
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
 #endif
 
-static const int widths[]  = {4, 4, 8, 8,  8, 16, 16, 16, 32, 32, 32, 64, 64};
-static const int heights[] = {4, 8, 4, 8, 16,  8, 16, 32, 16, 32, 64, 32, 64};
-
 static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
   (void)bs;
   return 3 + (increase_denoising ? 1 : 0);
@@ -48,37 +46,37 @@
 static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
   (void)bs;
   (void)increase_denoising;
-  return 25 * 25;
+  return 625;
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
-                           int mv_row, int mv_col) {
-  if (mv_row * mv_row + mv_col * mv_col >
+                           int motion_magnitude) {
+  if (motion_magnitude >
       noise_motion_thresh(bs, increase_denoising)) {
     return 0;
   } else {
-    return widths[bs] * heights[bs] * 20;
+    return (1 << num_pels_log2_lookup[bs]) * 20;
   }
 }
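Editor's note, not part of the applied patch: two arithmetic changes above are easy to sanity-check. noise_motion_thresh() now returns 625, which is the old 25 * 25 written out, and the per-block pixel count widths[bs] * heights[bs] is replaced by 1 << num_pels_log2_lookup[bs]. The sketch below uses a hypothetical motion vector and assumes the BLOCK_16X16 entry of num_pels_log2_lookup is 8 (that table is defined elsewhere in libvpx, not in this hunk):

    #include <stdio.h>

    int main(void) {
      /* motion_magnitude is mv_row^2 + mv_col^2; (15, 20) sits exactly on
       * the 25-pel threshold: 225 + 400 == 625 == 25 * 25. */
      const int mv_row = 15, mv_col = 20;
      printf("magnitude=%d threshold=%d\n",
             mv_row * mv_row + mv_col * mv_col, 625);
      /* For a 16x16 block, 1 << 8 recovers the old widths[bs] * heights[bs]. */
      printf("pels=%d\n", 1 << 8);   /* 256 */
      return 0;
    }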
 
-static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
-}
-
 static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 
-static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
-                                             const uint8_t *mc_avg,
-                                             int mc_avg_stride,
-                                             uint8_t *avg, int avg_stride,
-                                             int increase_denoising,
-                                             BLOCK_SIZE bs) {
+// TODO(jackychen): If increase_denoising is enabled in the future, we might
+// need to update the code for calculating 'total_adj' in case the C code is
+// not bit-exact with the corresponding SSE2 code.
+int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+                          const uint8_t *mc_avg,
+                          int mc_avg_stride,
+                          uint8_t *avg, int avg_stride,
+                          int increase_denoising,
+                          BLOCK_SIZE bs,
+                          int motion_magnitude) {
   int r, c;
   const uint8_t *sig_start = sig;
   const uint8_t *mc_avg_start = mc_avg;
@@ -86,10 +84,23 @@
   int diff, adj, absdiff, delta;
   int adj_val[] = {3, 4, 6};
   int total_adj = 0;
+  int shift_inc = 1;
+
+  // If motion_magnitude is small, make the denoiser more aggressive by
+  // increasing the adjustment for each level, and add another increment for
+  // blocks that are flagged for increased denoising.
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
 
   // First attempt to apply a strong temporal denoising filter.
-  for (r = 0; r < heights[bs]; ++r) {
-    for (c = 0; c < widths[bs]; ++c) {
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
       diff = mc_avg[c] - sig[c];
       absdiff = abs(diff);
 
@@ -129,27 +140,34 @@
 
   // Otherwise, we try to dampen the filter if the delta is not too high.
   delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
-           >> 8) + 1;
-  if (delta > delta_thresh(bs, increase_denoising)) {
+           >> num_pels_log2_lookup[bs]) + 1;
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
   }
 
   mc_avg =  mc_avg_start;
   avg = avg_start;
   sig = sig_start;
-  for (r = 0; r < heights[bs]; ++r) {
-    for (c = 0; c < widths[bs]; ++c) {
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
       diff = mc_avg[c] - sig[c];
       adj = abs(diff);
       if (adj > delta) {
         adj = delta;
       }
       if (diff > 0) {
+        // A positive diff means the first (strong) pass made a positive
+        // adjustment, so now make a negative adjustment to bring the
+        // denoised signal back down.
         avg[c] = MAX(0, avg[c] - adj);
-        total_adj += adj;
-      } else {
-        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
         total_adj -= adj;
+      } else {
+        // Diff negative means we made negative adjustment above
+        // (in first try/attempt), so now make positive adjustment to bring
+        // denoised signal up.
+        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
+        total_adj += adj;
       }
     }
     sig += sig_stride;
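Editor's note, not part of the applied patch: the shift_inc logic near the top of vp9_denoiser_filter_c only bumps the adjustment table when the motion magnitude is small. A standalone sketch of that bump follows; motion_magnitude is a hypothetical input and motion_thresh_sketch is a stand-in for the real MOTION_MAGNITUDE_THRESHOLD macro, which is defined outside this hunk:

    #include <stdio.h>

    int main(void) {
      int adj_val[] = {3, 4, 6};               /* base per-level adjustments */
      const int increase_denoising = 0;        /* hypothetical input */
      const int motion_magnitude = 100;        /* hypothetical input */
      const int motion_thresh_sketch = 200;    /* stand-in threshold */
      int shift_inc = 1, i;
      if (motion_magnitude <= motion_thresh_sketch) {
        if (increase_denoising)
          shift_inc = 2;
        for (i = 0; i < 3; i++)
          adj_val[i] += shift_inc;
      }
      /* Prints 4 5 7; with increase_denoising it would print 5 6 8. */
      printf("%d %d %d\n", adj_val[0], adj_val[1], adj_val[2]);
      return 0;
    }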
@@ -169,53 +187,36 @@
   return framebuf + (stride * mi_row * 8) + (mi_col * 8);
 }
 
-static void copy_block(uint8_t *dest, int dest_stride,
-                       const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
-  int r;
-  for (r = 0; r < heights[bs]; ++r) {
-    vpx_memcpy(dest, src, widths[bs]);
-    dest += dest_stride;
-    src += src_stride;
-  }
-}
-
 static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                                                          MACROBLOCK *mb,
                                                          BLOCK_SIZE bs,
                                                          int increase_denoising,
                                                          int mi_row,
                                                          int mi_col,
-                                                         PICK_MODE_CONTEXT *ctx
+                                                         PICK_MODE_CONTEXT *ctx,
+                                                         int *motion_magnitude
                                                          ) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
-
   MB_MODE_INFO saved_mbmi;
   int i, j;
   struct buf_2d saved_dst[MAX_MB_PLANE];
   struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
 
-  // We will restore these after motion compensation.
-  saved_mbmi = *mbmi;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
-    }
-    saved_dst[i] = filter_mbd->plane[i].dst;
-  }
-
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
-
+  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
   frame = ctx->best_reference_frame;
 
+  saved_mbmi = *mbmi;
+
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
-      sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) {
+      sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
     mbmi->ref_frame[0] = ctx->best_reference_frame;
     mbmi->mode = ctx->best_sse_inter_mode;
     mbmi->mv[0] = ctx->best_sse_mv;
@@ -232,6 +233,26 @@
     ctx->newmv_sse = ctx->zeromv_sse;
   }
 
+  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+  if (*motion_magnitude >
+     (noise_motion_thresh(bs, increase_denoising) << 3)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+
+  // We will restore these after motion compensation.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
+
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
   for (j = 0; j < 2; ++j) {
@@ -284,19 +305,13 @@
   mv_row = ctx->best_sse_mv.as_mv.row;
   mv_col = ctx->best_sse_mv.as_mv.col;
 
-  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
-    return COPY_BLOCK;
-  }
-  if (mv_row * mv_row + mv_col * mv_col >
-      8 * noise_motion_thresh(bs, increase_denoising)) {
-    return COPY_BLOCK;
-  }
   return FILTER_BLOCK;
 }
 
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
+  int motion_magnitude = 0;
   VP9_DENOISER_DECISION decision = FILTER_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -307,19 +322,26 @@
 
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
-                                         mi_row, mi_col, ctx);
+                                         mi_row, mi_col, ctx,
+                                         &motion_magnitude);
 
   if (decision == FILTER_BLOCK) {
-    decision = denoiser_filter(src.buf, src.stride,
-                               mc_avg_start, mc_avg.y_stride,
-                               avg_start, avg.y_stride,
-                               0, bs);
+    decision = vp9_denoiser_filter(src.buf, src.stride,
+                                 mc_avg_start, mc_avg.y_stride,
+                                 avg_start, avg.y_stride,
+                                 0, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
-    copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
   } else {  // COPY_BLOCK
-    copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
   }
 }
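
The reordering above computes the motion magnitude once from the best-SSE motion vector, hands it back to the caller so vp9_denoiser_filter() can take it into account, and takes the COPY_BLOCK exits before any prediction buffers are saved or swapped. A minimal stand-alone sketch of that decision order, with plain integer arguments standing in for sse_thresh() and noise_motion_thresh() (the enum and helper names here are illustrative, not the library's):

  typedef enum { COPY_BLOCK_SKETCH, FILTER_BLOCK_SKETCH } sketch_decision;

  static sketch_decision denoise_decision_sketch(unsigned int newmv_sse,
                                                 int mv_row, int mv_col,
                                                 unsigned int sse_threshold,
                                                 int motion_threshold,
                                                 int *motion_magnitude) {
    *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
    if (newmv_sse > sse_threshold)
      return COPY_BLOCK_SKETCH;    /* residual too large: keep the source */
    if (*motion_magnitude > (motion_threshold << 3))
      return COPY_BLOCK_SKETCH;    /* too much motion to average as noise */
    return FILTER_BLOCK_SKETCH;    /* blend the block into the running avg */
  }
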
 
@@ -327,16 +349,26 @@
   int r;
   const uint8_t *srcbuf = src.y_buffer;
   uint8_t *destbuf = dest.y_buffer;
+
   assert(dest.y_width == src.y_width);
   assert(dest.y_height == src.y_height);
 
   for (r = 0; r < dest.y_height; ++r) {
-    vpx_memcpy(destbuf, srcbuf, dest.y_width);
+    memcpy(destbuf, srcbuf, dest.y_width);
     destbuf += dest.y_stride;
     srcbuf += src.y_stride;
   }
 }
 
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+                              YV12_BUFFER_CONFIG *src) {
+  uint8_t *tmp_buf = dest->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src->y_buffer;
+  src->y_buffer = tmp_buf;
+}
+
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
@@ -346,22 +378,23 @@
   if (frame_type == KEY_FRAME) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
-    for (i = 1; i < MAX_REF_FRAMES; ++i) {
+    for (i = 1; i < MAX_REF_FRAMES; ++i)
       copy_frame(denoiser->running_avg_y[i], src);
-    }
-  } else {  /* For non key frames */
-    if (refresh_alt_ref_frame) {
-      copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_golden_frame) {
-      copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_last_frame) {
-      copy_frame(denoiser->running_avg_y[LAST_FRAME],
-                 denoiser->running_avg_y[INTRA_FRAME]);
-    }
+    return;
+  }
+
+  /* For non key frames */
+  if (refresh_alt_ref_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+  if (refresh_golden_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+  if (refresh_last_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
   }
 }
 
@@ -370,8 +403,8 @@
   ctx->newmv_sse = UINT_MAX;
 }
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
-                                     unsigned int sse, PREDICTION_MODE mode,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+                                     PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
   // TODO(tkopp): Use both MVs if possible
   if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
@@ -379,7 +412,7 @@
     ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
   }
 
-  if (mode == NEWMV) {
+  if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
     ctx->newmv_sse = sse;
     ctx->best_sse_inter_mode = mode;
     ctx->best_sse_mv = mbmi->mv[0];
@@ -388,13 +421,22 @@
 }
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int ssx, int ssy, int border) {
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border) {
   int i, fail;
+  const int legacy_byte_alignment = 0;
   assert(denoiser != NULL);
 
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
-    fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
-                                  ssx, ssy, border);
+    fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                  use_highbitdepth,
+#endif
+                                  border, legacy_byte_alignment);
     if (fail) {
       vp9_denoiser_free(denoiser);
       return 1;
@@ -404,8 +446,12 @@
 #endif
   }
 
-  fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
-                                ssx, ssy, border);
+  fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
   if (fail) {
     vp9_denoiser_free(denoiser);
     return 1;
@@ -414,23 +460,21 @@
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
   denoiser->increase_denoising = 0;
+  denoiser->frame_buffer_initialized = 1;
 
   return 0;
 }
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   int i;
+  denoiser->frame_buffer_initialized = 0;
   if (denoiser == NULL) {
     return;
   }
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
-    if (&denoiser->running_avg_y[i] != NULL) {
-      vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
-    }
+    vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
   }
-  if (&denoiser->mc_running_avg_y != NULL) {
-    vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
-  }
+  vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
 }
 
 #ifdef OUTPUT_YUV_DENOISED
@@ -439,15 +483,13 @@
   uint8_t *u = yuv->u_buffer;
   uint8_t *v = yuv->v_buffer;
 
-  // The '/2's are there because we have a 440 buffer, but we want to output
-  // 420.
-  for (r = 0; r < yuv->uv_height / 2; ++r) {
-    for (c = 0; c < yuv->uv_width / 2; ++c) {
+  for (r = 0; r < yuv->uv_height; ++r) {
+    for (c = 0; c < yuv->uv_width; ++c) {
       u[c] = UINT8_MAX / 2;
       v[c] = UINT8_MAX / 2;
     }
-    u += yuv->uv_stride + yuv->uv_width / 2;
-    v += yuv->uv_stride + yuv->uv_width / 2;
+    u += yuv->uv_stride;
+    v += yuv->uv_stride;
   }
 }
 #endif
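
The grayscale fill above now iterates the chroma planes at their full 4:2:0 dimensions and advances by the plane stride alone, rather than halving both and skewing the pointer each row. The same fill pattern as a stand-alone sketch, with the plane geometry passed explicitly (the function name is illustrative):

  static void fill_neutral_chroma(unsigned char *plane, int width, int height,
                                  int stride) {
    int r, c;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c)
        plane[c] = 255 / 2;        /* mid-gray chroma value */
      plane += stride;
    }
  }
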
diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h
index d93846f..b2af792 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/libvpx/vp9/encoder/vp9_denoiser.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
   FILTER_BLOCK
@@ -27,6 +29,7 @@
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
   int increase_denoising;
+  int frame_buffer_initialized;
 } VP9_DENOISER;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -42,12 +45,25 @@
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
                                      unsigned int sse, PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx);
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int ssx, int ssy, int border);
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// This function is used by both the C and SSE2 denoiser implementations.
+// Define it as a static function within the scope where vp9_denoiser.h
+// is referenced.
+static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
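
To give the new constants a sense of scale: MOTION_MAGNITUDE_THRESHOLD is 8 * 3 = 24, and total_adj_strong_thresh() scales with the pixel count of the block. A small worked example, re-declaring the helper locally so it stands alone and assuming num_pels_log2_lookup[BLOCK_16X16] == 8 (a 16x16 block has 256 pixels):

  #include <stdio.h>

  static int total_adj_strong_thresh_sketch(int num_pels_log2,
                                            int increase_denoising) {
    return (1 << num_pels_log2) * (increase_denoising ? 3 : 2);
  }

  int main(void) {
    printf("16x16 normal:   %d\n", total_adj_strong_thresh_sketch(8, 0));  /* 512 */
    printf("16x16 stronger: %d\n", total_adj_strong_thresh_sketch(8, 1));  /* 768 */
    return 0;
  }
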
 
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 4e7b8e4..295a751 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -13,9 +13,12 @@
 #include <stdio.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
+#include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -27,7 +30,6 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/encoder/vp9_aq_complexity.h"
@@ -36,6 +38,7 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rd.h"
@@ -43,60 +46,115 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-#define GF_ZEROMV_ZBIN_BOOST 0
-#define LF_ZEROMV_ZBIN_BOOST 0
-#define MV_ZBIN_BOOST        0
-#define SPLIT_MV_ZBIN_BOOST  0
-#define INTRA_ZBIN_BOOST     0
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
+                              TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx);
 
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
 static const uint8_t VP9_VAR_OFFS[64] = {
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
 };
 
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
-                                              const struct buf_2d *ref,
-                                              BLOCK_SIZE bs) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs) {
   unsigned int sse;
   const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
                                               VP9_VAR_OFFS, 0, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(
+    VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+  unsigned int var, sse;
+  switch (bd) {
+    case 10:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10),
+                               0, &sse);
+      break;
+    case 12:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12),
+                               0, &sse);
+      break;
+    case 8:
+    default:
+      var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8),
+                               0, &sse);
+      break;
+  }
+  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
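
Both variants end by normalizing the block variance to a per-pixel value. Assuming ROUND_POWER_OF_TWO(v, n) is the usual round-to-nearest right shift, that step in isolation is:

  /* Per-pixel variance from a whole-block variance; num_pels_log2 is the
     log2 of the block's pixel count (e.g. 12 for a 64x64 block). */
  static unsigned int per_pixel_variance(unsigned int block_variance,
                                         int num_pels_log2) {
    return (block_variance + (1u << (num_pels_log2 - 1))) >> num_pels_log2;
  }
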
+
 static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
                                                    const struct buf_2d *ref,
                                                    int mi_row, int mi_col,
                                                    BLOCK_SIZE bs) {
+  unsigned int sse, var;
+  uint8_t *last_y;
   const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
-  const uint8_t* last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride +
-                                              mi_col * MI_SIZE];
-  unsigned int sse;
-  const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                                              last_y, last->y_stride, &sse);
+
+  assert(last != NULL);
+  last_y =
+      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
                                                    int mi_row,
                                                    int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+  unsigned int var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
                                                     mi_row, mi_col,
                                                     BLOCK_64X64);
   if (var < 8)
@@ -109,34 +167,22 @@
     return BLOCK_8X8;
 }
 
-static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
-                                                      int mi_row,
-                                                      int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
-                                                    mi_row, mi_col,
-                                                    BLOCK_64X64);
-  if (var < 4)
-    return BLOCK_64X64;
-  else if (var < 10)
-    return BLOCK_32X32;
-  else
-    return BLOCK_16X16;
-}
-
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
-static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
-                                        MACROBLOCKD *const xd,
-                                        int mi_row,
-                                        int mi_col) {
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+                                         MACROBLOCK *const x,
+                                         MACROBLOCKD *const xd,
+                                         int mi_row,
+                                         int mi_col) {
   const int idx_str = xd->mi_stride * mi_row + mi_col;
   xd->mi = cm->mi_grid_visible + idx_str;
   xd->mi[0] = cm->mi + idx_str;
+  x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 }
 
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
-                        int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  MACROBLOCK *const x = &cpi->mb;
+                        MACROBLOCK *const x, int mi_row, int mi_col,
+                        BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
@@ -146,7 +192,7 @@
 
   set_skip_context(xd, mi_row, mi_col);
 
-  set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+  set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
   mbmi = &xd->mi[0]->mbmi;
 
@@ -177,7 +223,7 @@
     if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
     vp9_init_plane_quantizers(cpi, x);
 
@@ -186,6 +232,9 @@
     mbmi->segment_id = 0;
     x->encode_breakout = cpi->encode_breakout;
   }
+
+  // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
+  xd->tile = *tile;
 }
 
 static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -202,20 +251,20 @@
 }
 
 static void set_block_size(VP9_COMP * const cpi,
+                           MACROBLOCK *const x,
+                           MACROBLOCKD *const xd,
                            int mi_row, int mi_col,
                            BLOCK_SIZE bsize) {
   if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
-    MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-    set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col);
+    set_mode_info_offsets(&cpi->common, x, xd, mi_row, mi_col);
     xd->mi[0]->mbmi.sb_type = bsize;
-    duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
   }
 }
 
 typedef struct {
   int64_t sum_square_error;
   int64_t sum_error;
-  int count;
+  int log2_count;
   int variance;
 } var;
 
@@ -228,6 +277,11 @@
 typedef struct {
   partition_variance part_variances;
   var split[4];
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
 } v8x8;
 
 typedef struct {
@@ -259,7 +313,6 @@
 static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
   int i;
   node->part_variances = NULL;
-  vpx_memset(node->split, 0, sizeof(node->split));
   switch (bsize) {
     case BLOCK_64X64: {
       v64x64 *vt = (v64x64 *) data;
@@ -286,6 +339,13 @@
       v8x8 *vt = (v8x8 *) data;
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *vt = (v4x4 *) data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
         node->split[i] = &vt->split[i];
       break;
     }
@@ -300,22 +360,23 @@
 static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
   v->sum_square_error = s2;
   v->sum_error = s;
-  v->count = c;
-  if (c > 0)
-    v->variance = (int)(256 *
-                        (v->sum_square_error - v->sum_error * v->sum_error /
-                         v->count) / v->count);
-  else
-    v->variance = 0;
+  v->log2_count = c;
 }
 
-void sum_2_variances(const var *a, const var *b, var *r) {
+static void get_variance(var *v) {
+  v->variance = (int)(256 * (v->sum_square_error -
+      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
+}
+
+static void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
   fill_variance(a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->count + b->count, r);
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
 }
 
 static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
   variance_node node;
+  memset(&node, 0, sizeof(node));
   tree_to_node(data, bsize, &node);
   sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
   sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
@@ -326,73 +387,305 @@
 }
 
 static int set_vt_partitioning(VP9_COMP *cpi,
+                               MACROBLOCK *const x,
+                               MACROBLOCKD *const xd,
                                void *data,
                                BLOCK_SIZE bsize,
                                int mi_row,
-                               int mi_col) {
+                               int mi_col,
+                               int64_t threshold,
+                               BLOCK_SIZE bsize_min,
+                               int force_split) {
   VP9_COMMON * const cm = &cpi->common;
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  // TODO(debargha): Choose this more intelligently.
-  const int64_t threshold_multiplier = 25;
-  int64_t threshold = threshold_multiplier * cpi->common.base_qindex;
-  assert(block_height == block_width);
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
 
+  assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  // Split none is available only if we have more than half a block size
-  // in width and height inside the visible image.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->none.variance < threshold) {
-    set_block_size(cpi, mi_row, mi_col, bsize);
-    return 1;
-  }
+  if (force_split == 1)
+    return 0;
 
-  // Vertical split is available on all but the bottom border.
-  if (mi_row + block_height / 2 < cm->mi_rows &&
-      vt.part_variances->vert[0].variance < threshold &&
-      vt.part_variances->vert[1].variance < threshold) {
-    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-    set_block_size(cpi, mi_row, mi_col, subsize);
-    set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
-    return 1;
-  }
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_min) {
+    // Variance already computed to set the force_split.
+    if (low_res || cm->frame_type == KEY_FRAME)
+      get_variance(&vt.part_variances->none);
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_min) {
+    // Variance already computed to set the force_split.
+    if (low_res || cm->frame_type == KEY_FRAME)
+      get_variance(&vt.part_variances->none);
+    // For key frame: take split for bsize above 32X32 or very high variance.
+    if (cm->frame_type == KEY_FRAME &&
+        (bsize > BLOCK_32X32 ||
+        vt.part_variances->none.variance > (threshold << 4))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
 
-  // Horizontal split is available on all but the right border.
-  if (mi_col + block_width / 2 < cm->mi_cols &&
-      vt.part_variances->horz[0].variance < threshold &&
-      vt.part_variances->horz[1].variance < threshold) {
-    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-    set_block_size(cpi, mi_row, mi_col, subsize);
-    set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
-    return 1;
+    // Check vertical split.
+    if (mi_row + block_height / 2 < cm->mi_rows) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold &&
+          get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
+    }
+    // Check horizontal split.
+    if (mi_col + block_width / 2 < cm->mi_cols) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold &&
+          get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
+    }
+
+    return 0;
   }
   return 0;
 }
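
Condensed, the ladder above reads: a forced split wins outright; otherwise a block whose overall variance is under the threshold is kept whole; at the minimum size there is nothing left to try, while larger blocks still get a chance at a two-way vertical or horizontal split before falling through to a four-way split. A sketch of just that ordering (border checks and the key-frame special case are left out; all names are illustrative):

  static int pick_partition_sketch(int force_split, int at_min_size,
                                   int var_none,
                                   int var_vert0, int var_vert1,
                                   int var_horz0, int var_horz1,
                                   long long threshold) {
    if (force_split)
      return 0;                 /* caller keeps recursing into the split */
    if (var_none < threshold)
      return 1;                 /* keep this block size as-is */
    if (at_min_size)
      return 0;                 /* smallest size reached: must split */
    if (var_vert0 < threshold && var_vert1 < threshold)
      return 1;                 /* vertical two-way split suffices */
    if (var_horz0 < threshold && var_horz1 < threshold)
      return 1;                 /* horizontal two-way split suffices */
    return 0;                   /* otherwise split four ways */
  }
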
 
-// TODO(debargha): Fix this function and make it work as expected.
-static void choose_partitioning(VP9_COMP *cpi,
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
+// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to a 4x4 partition) is
+// currently only used on key frames.
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int threshold_multiplier = is_key_frame ? 20 : 1;
+  const int64_t threshold_base = (int64_t)(threshold_multiplier *
+      cpi->y_dequant[q][1]);
+  if (is_key_frame) {
+    thresholds[0] = threshold_base;
+    thresholds[1] = threshold_base >> 2;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base << 2;
+  } else {
+    thresholds[1] = threshold_base;
+    if (cm->width <= 352 && cm->height <= 288) {
+      thresholds[0] = threshold_base >> 2;
+      thresholds[2] = threshold_base << 3;
+    } else {
+      thresholds[0] = threshold_base;
+      thresholds[1] = (5 * threshold_base) >> 2;
+      if (cm->width >= 1920 && cm->height >= 1080)
+        thresholds[1] = (7 * threshold_base) >> 2;
+      thresholds[2] = threshold_base << cpi->oxcf.speed;
+    }
+  }
+}
+
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  if (sf->partition_search_type != VAR_BASED_PARTITION &&
+      sf->partition_search_type != REFERENCE_PARTITION) {
+    return;
+  } else {
+    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+    // The thresholds below are not changed locally.
+    if (is_key_frame) {
+      cpi->vbp_threshold_sad = 0;
+      cpi->vbp_bsize_min = BLOCK_8X8;
+    } else {
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_sad = 100;
+      else
+        cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
+            (cpi->y_dequant[q][1] << 1) : 1000;
+      cpi->vbp_bsize_min = BLOCK_16X16;
+    }
+    cpi->vbp_threshold_minmax = 15 + (q >> 3);
+  }
+}
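
As a concrete illustration of the scaling in set_vbp_thresholds() for the common case (non-key frame, resolution above 352x288), assume a hypothetical y dequant value of 64 for the selected q and a speed setting of 6; the real values come from cpi->y_dequant[q][1] and cpi->oxcf.speed:

  #include <stdio.h>

  int main(void) {
    const long long base = 1 * 64;          /* threshold_multiplier * dequant */
    const int speed = 6;
    const long long t64 = base;             /* 64x64 threshold: 64   */
    const long long t32 = (5 * base) >> 2;  /* 32x32 threshold: 80   */
    const long long t16 = base << speed;    /* 16x16 threshold: 4096 */
    printf("64x64=%lld 32x32=%lld 16x16=%lld\n", t64, t32, t16);
    return 0;
  }
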
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                              int highbd_flag,
+#endif
+                              int pixels_wide,
+                              int pixels_high) {
+  int k;
+  int minmax_max = 0;
+  int minmax_min = 255;
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    int min = 0;
+    int max = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                              d + y8_idx * dp + x8_idx, dp,
+                              &min, &max);
+      } else {
+        vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                       d + y8_idx * dp + x8_idx, dp,
+                       &min, &max);
+      }
+#else
+      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                     d + y8_idx * dp + x8_idx, dp,
+                     &min, &max);
+#endif
+      if ((max - min) > minmax_max)
+        minmax_max = (max - min);
+      if ((max - min) < minmax_min)
+        minmax_min = (max - min);
+    }
+  }
+  return (minmax_max - minmax_min);
+}
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x4_idx = x8_idx + ((k & 1) << 2);
+    int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      } else {
+        s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      }
+#else
+      s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      if (!is_key_frame)
+        d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      } else {
+        s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      }
+#else
+      s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      if (!is_key_frame)
+        d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// This function chooses partitioning based on the variance between the source
+// and the reconstructed last frame, computed on down-sampled inputs.
+static int choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
+                                MACROBLOCK *x,
                                 int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int i, j, k;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int i, j, k, m;
   v64x64 vt;
+  v16x16 vt2[16];
+  int force_split[21];
   uint8_t *s;
   const uint8_t *d;
   int sp;
   int dp;
   int pixels_wide = 64, pixels_high = 64;
-  int_mv nearest_mv, near_mv;
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
 
-  vp9_zero(vt);
-  set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+  // Always use 4x4 partition for key frame.
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int use_4x4_partition = is_key_frame;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[16];
+
+  int segment_id = CR_SEGMENT_ID_BASE;
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+                                                    cm->last_frame_seg_map;
+    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+    if (cyclic_refresh_segment_id_boosted(segment_id)) {
+      int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+      set_vbp_thresholds(cpi, thresholds, q);
+    }
+  }
+
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -402,115 +695,281 @@
   s = x->plane[0].src.buf;
   sp = x->plane[0].src.stride;
 
-  if (cm->frame_type != KEY_FRAME) {
-    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+  if (!is_key_frame && !(is_one_pass_cbr_svc(cpi) &&
+      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+    // In the case of spatial/temporal scalable coding, the assumption here is
+    // that the temporal reference frame will always be of type LAST_FRAME.
+    // TODO(marpan): If that assumption is broken, we need to revisit this code.
+    MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+    unsigned int uv_sad;
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
 
-    xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
-    xd->mi[0]->mbmi.sb_type = BLOCK_64X64;
-    vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
-                          xd->mi[0]->mbmi.ref_mvs[LAST_FRAME],
-                          &nearest_mv, &near_mv);
+    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    unsigned int y_sad, y_sad_g;
+    const BLOCK_SIZE bsize = BLOCK_32X32
+        + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
 
-    xd->mi[0]->mbmi.mv[0] = nearest_mv;
-    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+    assert(yv12 != NULL);
+
+    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) {
+      // For now, GOLDEN will not be used for non-zero spatial layers, since
+      // it may not be a temporal reference.
+      yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    }
+
+    if (yv12_g && yv12_g != yv12 &&
+       (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      y_sad_g = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       xd->plane[0].pre[0].buf,
+                                       xd->plane[0].pre[0].stride);
+    } else {
+      y_sad_g = UINT_MAX;
+    }
+
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         &cm->frame_refs[LAST_FRAME - 1].sf);
+    mbmi->ref_frame[0] = LAST_FRAME;
+    mbmi->ref_frame[1] = NONE;
+    mbmi->sb_type = BLOCK_64X64;
+    mbmi->mv[0].as_int = 0;
+    mbmi->interp_filter = BILINEAR;
+
+    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+    if (y_sad_g < y_sad) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      mbmi->ref_frame[0] = GOLDEN_FRAME;
+      mbmi->mv[0].as_int = 0;
+      y_sad = y_sad_g;
+    } else {
+      x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+    }
+
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+
+    for (i = 1; i <= 2; ++i) {
+      struct macroblock_plane  *p = &x->plane[i];
+      struct macroblockd_plane *pd = &xd->plane[i];
+      const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+      if (bs == BLOCK_INVALID)
+        uv_sad = UINT_MAX;
+      else
+        uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                     pd->dst.buf, pd->dst.stride);
+
+      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+    }
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+    if (segment_id == CR_SEGMENT_ID_BASE &&
+        y_sad < cpi->vbp_threshold_sad) {
+      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      if (mi_col + block_width / 2 < cm->mi_cols &&
+          mi_row + block_height / 2 < cm->mi_rows) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
+        return 0;
+      }
+    }
   } else {
     d = VP9_VAR_OFFS;
     dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      switch (xd->bd) {
+        case 10:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          break;
+        case 12:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          break;
+        case 8:
+        default:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Fill in the entire tree of 8x8 variances for splits.
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
   for (i = 0; i < 4; i++) {
     const int x32_idx = ((i & 1) << 5);
     const int y32_idx = ((i >> 1) << 5);
+    const int i2 = i << 2;
+    force_split[i + 1] = 0;
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
+      const int split_index = 5 + i2 + j;
       v16x16 *vst = &vt.split[i].split[j];
-      for (k = 0; k < 4; k++) {
-        int x_idx = x16_idx + ((k & 1) << 3);
-        int y_idx = y16_idx + ((k >> 1) << 3);
-        unsigned int sse = 0;
-        int sum = 0;
-        if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get8x8var(s + y_idx * sp + x_idx, sp,
-                        d + y_idx * dp + x_idx, dp, &sse, &sum);
-        fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
-      }
-    }
-  }
-  // Fill the rest of the variance tree by summing split partition values.
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-    }
-    fill_variance_tree(&vt.split[i], BLOCK_32X32);
-  }
-  fill_variance_tree(&vt, BLOCK_64X64);
-
-  // Now go through the entire structure,  splitting every block size until
-  // we get to one that's got a variance lower than our threshold,  or we
-  // hit 8x8.
-  if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64,
-                           mi_row, mi_col)) {
-    for (i = 0; i < 4; ++i) {
-      const int x32_idx = ((i & 1) << 2);
-      const int y32_idx = ((i >> 1) << 2);
-      if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx))) {
-        for (j = 0; j < 4; ++j) {
-          const int x16_idx = ((j & 1) << 1);
-          const int y16_idx = ((j >> 1) << 1);
-          // NOTE: This is a temporary hack to disable 8x8 partitions,
-          // since it works really bad - possibly due to a bug
-#define DISABLE_8X8_VAR_BASED_PARTITION
-#ifdef DISABLE_8X8_VAR_BASED_PARTITION
-          if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows &&
-              mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) {
-            set_block_size(cpi,
-                           (mi_row + y32_idx + y16_idx),
-                           (mi_col + x32_idx + x16_idx),
-                           BLOCK_16X16);
-          } else {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              set_block_size(cpi,
-                             (mi_row + y32_idx + y16_idx + y8_idx),
-                             (mi_col + x32_idx + x16_idx + x8_idx),
-                             BLOCK_8X8);
-            }
-          }
-#else
-          if (!set_vt_partitioning(cpi, &vt.split[i].split[j], tile,
-                                   BLOCK_16X16,
-                                   (mi_row + y32_idx + y16_idx),
-                                   (mi_col + x32_idx + x16_idx), 2)) {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              set_block_size(cpi,
-                             (mi_row + y32_idx + y16_idx + y8_idx),
-                             (mi_col + x32_idx + x16_idx + x8_idx),
-                             BLOCK_8X8);
-            }
-          }
+      force_split[split_index] = 0;
+      variance4x4downsample[i2 + j] = 0;
+      if (!is_key_frame) {
+        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                            xd->cur_buf->flags,
 #endif
+                            pixels_wide,
+                            pixels_high,
+                            is_key_frame);
+        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+        get_variance(&vt.split[i].split[j].part_variances.none);
+        if (vt.split[i].split[j].part_variances.none.variance >
+            thresholds[2]) {
+          // 16X16 variance is above threshold for split, so force split to 8x8
+          // for this 16x16 block (this also forces splits for upper levels).
+          force_split[split_index] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        } else if (vt.split[i].split[j].part_variances.none.variance >
+                   thresholds[1] &&
+                   !cyclic_refresh_segment_id_boosted(segment_id)) {
+          // We have some nominal amount of 16x16 variance (based on average),
+          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+          // force split to 8x8 block for this 16x16 block.
+          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                          xd->cur_buf->flags,
+#endif
+                                          pixels_wide, pixels_high);
+          if (minmax > cpi->vbp_threshold_minmax) {
+            force_split[split_index] = 1;
+            force_split[i + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
+      }
+      // TODO(marpan): There is an issue with variance based on 4x4 average in
+      // svc mode, don't allow it for now.
+      if (is_key_frame || (low_res && !cpi->use_svc &&
+          vt.split[i].split[j].part_variances.none.variance >
+          (thresholds[1] << 1))) {
+        force_split[split_index] = 0;
+        // Go down to 4x4 down-sampling for variance.
+        variance4x4downsample[i2 + j] = 1;
+        for (k = 0; k < 4; k++) {
+          int x8_idx = x16_idx + ((k & 1) << 3);
+          int y8_idx = y16_idx + ((k >> 1) << 3);
+          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
+              &vt2[i2 + j].split[k];
+          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               xd->cur_buf->flags,
+#endif
+                               pixels_wide,
+                               pixels_high,
+                               is_key_frame);
         }
       }
     }
   }
+
+  // Fill the rest of the variance tree by summing split partition values.
+  for (i = 0; i < 4; i++) {
+    const int i2 = i << 2;
+    for (j = 0; j < 4; j++) {
+      if (variance4x4downsample[i2 + j] == 1) {
+        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
+            &vt.split[i].split[j];
+        for (m = 0; m < 4; m++)
+          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+        fill_variance_tree(vtemp, BLOCK_16X16);
+      }
+    }
+    fill_variance_tree(&vt.split[i], BLOCK_32X32);
+    // If variance of this 32x32 block is above the threshold, force the block
+    // to split. This also forces a split on the upper (64x64) level.
+    if (!force_split[i + 1]) {
+      get_variance(&vt.split[i].part_variances.none);
+      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+        force_split[i + 1] = 1;
+        force_split[0] = 1;
+      }
+    }
+  }
+  if (!force_split[0]) {
+    fill_variance_tree(&vt, BLOCK_64X64);
+    get_variance(&vt.part_variances.none);
+  }
+
+  // Now go through the entire structure, splitting every block size until
+  // we get to one that's got a variance lower than our threshold.
+  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
+      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
+    for (i = 0; i < 4; ++i) {
+      const int x32_idx = ((i & 1) << 2);
+      const int y32_idx = ((i >> 1) << 2);
+      const int i2 = i << 2;
+      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx),
+                               thresholds[1], BLOCK_16X16,
+                               force_split[i + 1])) {
+        for (j = 0; j < 4; ++j) {
+          const int x16_idx = ((j & 1) << 1);
+          const int y16_idx = ((j >> 1) << 1);
+          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+          // block, then the variance is based on 4x4 down-sampling, so use vt2
+          // in set_vt_partitioning(), otherwise use vt.
+          v16x16 *vtemp = (!is_key_frame &&
+                           variance4x4downsample[i2 + j] == 1) ?
+                           &vt2[i2 + j] : &vt.split[i].split[j];
+          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
+                                   mi_row + y32_idx + y16_idx,
+                                   mi_col + x32_idx + x16_idx,
+                                   thresholds[2],
+                                   cpi->vbp_bsize_min,
+                                   force_split[5 + i2  + j])) {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              if (use_4x4_partition) {
+                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
+                                         BLOCK_8X8,
+                                         mi_row + y32_idx + y16_idx + y8_idx,
+                                         mi_col + x32_idx + x16_idx + x8_idx,
+                                         thresholds[3], BLOCK_8X8, 0)) {
+                  set_block_size(cpi, x, xd,
+                                 (mi_row + y32_idx + y16_idx + y8_idx),
+                                 (mi_col + x32_idx + x16_idx + x8_idx),
+                                 BLOCK_4X4);
+                }
+              } else {
+                set_block_size(cpi, x, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_8X8);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
 }
 
-static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state(VP9_COMP *cpi, ThreadData *td,
+                         PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
                          int output_enabled) {
   int i, x_idx, y;
   VP9_COMMON *const cm = &cpi->common;
-  RD_OPT *const rd_opt = &cpi->rd;
-  MACROBLOCK *const x = &cpi->mb;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
@@ -518,6 +977,13 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   MODE_INFO *mi_addr = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
 
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
@@ -527,22 +993,23 @@
   assert(mi->mbmi.sb_type == bsize);
 
   *mi_addr = *mi;
+  *x->mbmi_ext = ctx->mbmi_ext;
 
   // If segmentation in use
-  if (seg->enabled && output_enabled) {
+  if (seg->enabled) {
     // For in frame complexity AQ copy the segment id from the segment map.
     if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       mi_addr->mbmi.segment_id =
-        vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+        get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
-    else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
-                                        mi_row, mi_col, bsize, 1);
-      vp9_init_plane_quantizers(cpi, x);
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
+                                        mi_col, bsize, ctx->rate, ctx->dist,
+                                        x->skip);
     }
   }
 
@@ -573,31 +1040,18 @@
   if (cpi->oxcf.aq_mode)
     vp9_init_plane_quantizers(cpi, x);
 
-  // FIXME(rbultje) I'm pretty sure this should go to the end of this block
-  // (i.e. after the output_enabled)
-  if (bsize < BLOCK_32X32) {
-    if (bsize < BLOCK_16X16)
-      ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8];
-    ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16];
-  }
-
   if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   }
 
   x->skip = ctx->skip;
-  vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
-             sizeof(uint8_t) * ctx->num_4x4_blk);
+  memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+         sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
   if (!output_enabled)
     return;
 
-  if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    for (i = 0; i < TX_MODES; i++)
-      rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i];
-  }
-
 #if CONFIG_INTERNAL_STATS
   if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
@@ -620,20 +1074,31 @@
 #endif
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
-      vp9_update_mv_count(cm, xd);
+      vp9_update_mv_count(td);
 
       if (cm->interp_filter == SWITCHABLE) {
         const int ctx = vp9_get_pred_context_switchable_interp(xd);
-        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+        ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
       }
     }
 
-    rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
-    rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
-    rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
 
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
+      rdc->filter_diff[i] += ctx->best_filter_diff[i];
+  }
+
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    }
   }
 }
 
@@ -652,8 +1117,8 @@
                      x->e_mbd.plane[i].subsampling_y);
 }
 
-static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
-                                   int64_t *dist, BLOCK_SIZE bsize) {
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   INTERP_FILTER filter_ref;
@@ -679,45 +1144,42 @@
   xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
   x->skip = 1;
 
-  *rate = 0;
-  *dist = 0;
+  vp9_rd_cost_init(rd_cost);
 }
 
-static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, int mi_col,
-                             int *totalrate, int64_t *totaldist,
-                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                             int64_t best_rd, int block) {
+static int set_segment_rdmult(VP9_COMP *const cpi,
+                               MACROBLOCK *const x,
+                               int8_t segment_id) {
+  int segment_qindex;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  vp9_init_plane_quantizers(cpi, x);
+  vpx_clear_system_state();
+  segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
+                                  cm->base_qindex);
+  return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static void rd_pick_sb_modes(VP9_COMP *cpi,
+                             TileDataEnc *tile_data,
+                             MACROBLOCK *const x,
+                             int mi_row, int mi_col, RD_COST *rd_cost,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
   int i, orig_rdmult;
-  double rdmult_ratio;
 
-  vp9_clear_system_state();
-  rdmult_ratio = 1.0;  // avoid uninitialized warnings
+  vpx_clear_system_state();
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
 
-  // TODO(JBB): Most other places in the code instead of calling the function
-  // and then checking if its not the first 8x8 we put the check in the
-  // calling function.  Do that here.
-  if (bsize < BLOCK_8X8) {
-    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
-    // there is nothing to be done.
-    if (block != 0) {
-      *totalrate = 0;
-      *totaldist = 0;
-      return;
-    }
-  }
-
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
 
@@ -728,12 +1190,26 @@
     p[i].eobs = ctx->eobs_pbuf[i][0];
   }
   ctx->is_coded = 0;
+  ctx->skippable = 0;
+  ctx->pred_pixel_ready = 0;
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
   mbmi->skip = 0;
 
-  x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->source_variance =
+        vp9_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
+                                           bsize, xd->bd);
+  } else {
+    x->source_variance =
+      vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+#else
+  x->source_variance =
+    vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
@@ -748,75 +1224,79 @@
     } else {
       const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
                                                     : cm->last_frame_seg_map;
-      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-
-    rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
-    vp9_init_plane_quantizers(cpi, x);
-    vp9_clear_system_state();
-    x->rdmult = (int)round(x->rdmult * rdmult_ratio);
+    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == COMPLEXITY_AQ) {
-    const int mi_offset = mi_row * cm->mi_cols + mi_col;
-    unsigned char complexity = cpi->complexity_map[mi_offset];
-    const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
-                        (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
-    if (!is_edge && (complexity > 128))
-      x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
+    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == CYCLIC_REFRESH_AQ) {
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
                                                   : cm->last_frame_seg_map;
-    // If segment 1, use rdmult for that segment.
-    if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))
+    // If segment is boosted, use rdmult for that segment.
+    if (cyclic_refresh_segment_id_boosted(
+            get_segment_id(cm, map, bsize, mi_row, mi_col)))
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
-    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
-                              best_rd);
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, totalrate, totaldist, bsize,
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
-        vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
-                                  totalrate, totaldist, bsize, ctx, best_rd);
+        vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
+                                  rd_cost, bsize, ctx, best_rd);
     } else {
-      vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
-                                    totaldist, bsize, ctx, best_rd);
+      vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                    rd_cost, bsize, ctx, best_rd);
     }
   }
 
+
+  // Examine the resulting rate and for AQ mode 2 make a segment choice.
+  if ((rd_cost->rate != INT_MAX) &&
+      (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+      (cm->frame_type == KEY_FRAME ||
+       cpi->refresh_alt_ref_frame ||
+       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+    vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+
   x->rdmult = orig_rdmult;
 
-  if (aq_mode == VARIANCE_AQ && *totalrate != INT_MAX) {
-    vp9_clear_system_state();
-    *totalrate = (int)round(*totalrate * rdmult_ratio);
-  }
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX)
+    rd_cost->rdcost = INT64_MAX;
+
+  ctx->rate = rd_cost->rate;
+  ctx->dist = rd_cost->dist;
 }
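
rd_pick_sb_modes() is part of the wider refactor in this pull that replaces the separate *totalrate/*totaldist out-parameters with a single RD_COST carrying rate, distortion and the combined rd cost, together with vp9_rd_cost_init()/vp9_rd_cost_reset() helpers (the hunks here show init standing in for "*rate = 0; *dist = 0;" and reset standing in for the INT_MAX/INT64_MAX sentinels). The sketch below mirrors that pattern; the rd_combine() weighting is only illustrative and does not reproduce the fixed-point scaling of the encoder's RDCOST() macro:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the rate/distortion bundle introduced by this refactor. */
typedef struct {
  int rate;        /* bits */
  int64_t dist;    /* distortion */
  int64_t rdcost;  /* combined cost used for comparisons */
} RD_COST_SKETCH;

/* "init": a valid, empty accumulator (mirrors vp9_rd_cost_init replacing
 * "*rate = 0; *dist = 0;" in set_mode_info_seg_skip). */
static void rd_cost_init(RD_COST_SKETCH *rd) {
  rd->rate = 0;
  rd->dist = 0;
  rd->rdcost = 0;
}

/* "reset": the invalid/worst-case sentinel (mirrors vp9_rd_cost_reset
 * replacing the INT_MAX/INT64_MAX assignments in rd_use_partition). */
static void rd_cost_reset(RD_COST_SKETCH *rd) {
  rd->rate = INT_MAX;
  rd->dist = INT64_MAX;
  rd->rdcost = INT64_MAX;
}

/* Illustrative combination: cost = lambda * rate + dist.  The real encoder
 * uses RDCOST(x->rdmult, x->rddiv, rate, dist); exact shifts not shown. */
static int64_t rd_combine(int rdmult, const RD_COST_SKETCH *rd) {
  if (rd->rate == INT_MAX) return INT64_MAX;
  return (int64_t)rdmult * rd->rate + rd->dist;
}

int main(void) {
  RD_COST_SKETCH a, b;
  rd_cost_init(&a);
  a.rate = 120; a.dist = 5000;
  rd_cost_reset(&b);  /* e.g. a partition that failed to produce a mode */
  printf("a=%lld b=%lld\n",
         (long long)rd_combine(70, &a), (long long)rd_combine(70, &b));
  return 0;
}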
 
-static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
+static void update_stats(VP9_COMMON *cm, ThreadData *td) {
+  const MACROBLOCK *x = &td->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
 
   if (!frame_is_intra_only(cm)) {
-    const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                                     SEG_LVL_REF_FRAME);
+    FRAME_COUNTS *const counts = td->counts;
+    const int inter_block = is_inter_block(mbmi);
+    const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
+                                                 SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
-      FRAME_COUNTS *const counts = &cm->counts;
-      const int inter_block = is_inter_block(mbmi);
-
       counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
         const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
         if (cm->reference_mode == REFERENCE_MODE_SELECT)
           counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
                             [has_second_ref(mbmi)]++;
@@ -833,15 +1313,33 @@
         }
       }
     }
+    if (inter_block &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+      if (bsize >= BLOCK_8X8) {
+        const PREDICTION_MODE mode = mbmi->mode;
+        ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+      } else {
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < 2; idy += num_4x4_h) {
+          for (idx = 0; idx < 2; idx += num_4x4_w) {
+            const int j = idy * 2 + idx;
+            const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+            ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          }
+        }
+      }
+    }
   }
 }
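
update_stats() now increments td->counts, the per-thread FRAME_COUNTS hanging off ThreadData, instead of cm->counts; the same cm->counts -> td->counts substitution runs through encode_sb(), encode_sb_rt() and update_state() in this diff, which appears to be in support of running the encode loop from per-tile worker threads. A hedged sketch of the idea; the accumulate() step is hypothetical and only stands in for however the encoder actually folds the per-thread counters back into the frame-level ones:

#include <stdio.h>

/* Hypothetical, trimmed-down counters; the real FRAME_COUNTS has many more
 * tables (partition, switchable_interp, inter_mode, ...). */
typedef struct {
  unsigned intra_inter[4][2];
} COUNTS_SKETCH;

typedef struct {
  COUNTS_SKETCH counts;  /* per-thread, like ThreadData counts in the diff */
} THREAD_DATA_SKETCH;

/* Each worker updates only its own counters, so no locking is needed. */
static void count_block(THREAD_DATA_SKETCH *td, int ctx, int is_inter) {
  ++td->counts.intra_inter[ctx][is_inter];
}

/* Hypothetical merge back into the frame-level counts after all tiles
 * finish (the upstream code has its own accumulation step). */
static void accumulate(COUNTS_SKETCH *dst, const COUNTS_SKETCH *src) {
  int i, j;
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 2; ++j)
      dst->intra_inter[i][j] += src->intra_inter[i][j];
}

int main(void) {
  COUNTS_SKETCH frame_counts = {{{0}}};
  THREAD_DATA_SKETCH td0 = {{{{0}}}}, td1 = {{{{0}}}};
  count_block(&td0, 1, 1);
  count_block(&td1, 1, 1);
  count_block(&td1, 0, 0);
  accumulate(&frame_counts, &td0.counts);
  accumulate(&frame_counts, &td1.counts);
  printf("ctx1 inter blocks: %u\n", frame_counts.intra_inter[1][1]);
  return 0;
}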
 
-static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                             PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
                             BLOCK_SIZE bsize) {
-  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -849,30 +1347,29 @@
   int mi_width = num_8x8_blocks_wide_lookup[bsize];
   int mi_height = num_8x8_blocks_high_lookup[bsize];
   for (p = 0; p < MAX_MB_PLANE; p++) {
-    vpx_memcpy(
+    memcpy(
         xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
         a + num_4x4_blocks_wide * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
-    vpx_memcpy(
+    memcpy(
         xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(xd->above_seg_context + mi_col, sa,
-             sizeof(*xd->above_seg_context) * mi_width);
-  vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
-             sizeof(xd->left_seg_context[0]) * mi_height);
+  memcpy(xd->above_seg_context + mi_col, sa,
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+         sizeof(xd->left_seg_context[0]) * mi_height);
 }
 
-static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                          PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
                          BLOCK_SIZE bsize) {
-  const MACROBLOCK *const x = &cpi->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -882,49 +1379,52 @@
 
   // buffer the above/left context information of the block in search.
   for (p = 0; p < MAX_MB_PLANE; ++p) {
-    vpx_memcpy(
+    memcpy(
         a + num_4x4_blocks_wide * p,
         xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
-    vpx_memcpy(
+    memcpy(
         l + num_4x4_blocks_high * p,
         xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(sa, xd->above_seg_context + mi_col,
-             sizeof(*xd->above_seg_context) * mi_width);
-  vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
-             sizeof(xd->left_seg_context[0]) * mi_height);
+  memcpy(sa, xd->above_seg_context + mi_col,
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+         sizeof(xd->left_seg_context[0]) * mi_height);
 }
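
save_context() and restore_context() now take the MACROBLOCK directly instead of reaching through VP9_COMP, but their job is unchanged: snapshot the above/left entropy and partition contexts before a trial encode and rewind them afterwards so the partition search can evaluate several candidates from the same starting state. A toy sketch of that save/try/restore pattern (the real contexts are the per-plane ENTROPY_CONTEXT and PARTITION_CONTEXT arrays shown above):

#include <stdio.h>
#include <string.h>

/* Toy stand-in for the above/left contexts that a trial encode mutates. */
#define CTX_SIZE 16

static void trial_encode(unsigned char *ctx) {
  /* a trial partition/mode decision updates the running contexts */
  memset(ctx, 0xAA, CTX_SIZE);
}

int main(void) {
  unsigned char ctx[CTX_SIZE] = {0};
  unsigned char saved[CTX_SIZE];

  memcpy(saved, ctx, sizeof(ctx));  /* like save_context() */
  trial_encode(ctx);                /* like rd_pick_sb_modes()/encode_sb(..., 0) */
  memcpy(ctx, saved, sizeof(ctx));  /* like restore_context(): rewind and try
                                       the next partition candidate */
  printf("context restored: %d\n", ctx[0] == 0);
  return 0;
}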
 
 static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+                     ThreadData *td,
                      TOKENEXTRA **tp, int mi_row, int mi_col,
                      int output_enabled, BLOCK_SIZE bsize,
                      PICK_MODE_CONTEXT *ctx) {
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled);
-  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  MACROBLOCK *const x = &td->mb;
+  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
   if (output_enabled) {
-    update_stats(&cpi->common, &cpi->mb);
+    update_stats(&cpi->common, td);
 
     (*tp)->token = EOSB_TOKEN;
     (*tp)++;
   }
 }
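
encode_b() hands the chosen PICK_MODE_CONTEXT to encode_superblock(), and encode_sb() below walks the PC_TREE recursively, dispatching on the partition type of each node. The sketch below shows only the PARTITION_NONE/PARTITION_SPLIT recursion shape with a toy node type; the HORZ/VERT cases and the real mi_row/mi_col bounds checks are left out:

#include <stdio.h>

/* Toy partition tree: a node either encodes as one block or splits in four,
 * loosely mirroring how encode_sb() walks PC_TREE by PARTITION_* type. */
typedef struct node {
  int split;          /* 0: PARTITION_NONE, 1: PARTITION_SPLIT */
  struct node *sub[4];
} NODE;

static void encode_sb_sketch(const NODE *n, int mi_row, int mi_col, int bs) {
  if (!n->split || bs == 1) {
    printf("encode %dx%d block at (%d,%d)\n", bs * 8, bs * 8, mi_row, mi_col);
    return;
  }
  {
    const int h = bs / 2;  /* plays the role of hbs in the real code */
    encode_sb_sketch(n->sub[0], mi_row,     mi_col,     h);
    encode_sb_sketch(n->sub[1], mi_row,     mi_col + h, h);
    encode_sb_sketch(n->sub[2], mi_row + h, mi_col,     h);
    encode_sb_sketch(n->sub[3], mi_row + h, mi_col + h, h);
  }
}

int main(void) {
  NODE leaf = {0, {0}};
  NODE root = {1, {&leaf, &leaf, &leaf, &leaf}};
  encode_sb_sketch(&root, 0, 0, 8);  /* one 64x64 SB split into four 32x32 */
  return 0;
}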
 
-static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb(VP9_COMP *cpi, ThreadData *td,
+                      const TileInfo *const tile,
                       TOKENEXTRA **tp, int mi_row, int mi_col,
                       int output_enabled, BLOCK_SIZE bsize,
                       PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize = bsize;
@@ -942,46 +1442,46 @@
 
   partition = partition_lookup[bsl][subsize];
   if (output_enabled && bsize != BLOCK_4X4)
-    cm->counts.partition[ctx][partition]++;
+    td->counts->partition[ctx][partition]++;
 
   switch (partition) {
     case PARTITION_NONE:
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
                &pc_tree->none);
       break;
     case PARTITION_VERT:
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
                &pc_tree->vertical[0]);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
-                 &pc_tree->vertical[1]);
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+                 subsize, &pc_tree->vertical[1]);
       }
       break;
     case PARTITION_HORZ:
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
                &pc_tree->horizontal[0]);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
-                 &pc_tree->horizontal[1]);
+        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+                 subsize, &pc_tree->horizontal[1]);
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+        encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
                  pc_tree->leaf_split[0]);
       } else {
-        encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
                   pc_tree->split[0]);
-        encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
-                  pc_tree->split[1]);
-        encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
-                  pc_tree->split[2]);
-        encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[1]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                  subsize, pc_tree->split[2]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
                   subsize, pc_tree->split[3]);
       }
       break;
     default:
-      assert("Invalid partition type.");
+      assert(0 && "Invalid partition type.");
       break;
   }
 
@@ -1061,79 +1561,7 @@
   }
 }
 
-static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8,
-  MODE_INFO **prev_mi_8x8) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col];
-      const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-
-      if (prev_mi) {
-        const ptrdiff_t offset = prev_mi - cm->prev_mi;
-        mi_8x8[block_row * mis + block_col] = cm->mi + offset;
-        mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type;
-      }
-    }
-  }
-}
-
-static void constrain_copy_partitioning(VP9_COMP *const cpi,
-                                        const TileInfo *const tile,
-                                        MODE_INFO **mi_8x8,
-                                        MODE_INFO **prev_mi_8x8,
-                                        int mi_row, int mi_col,
-                                        BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
-  MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  int block_row, block_col;
-
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
-  // If the SB64 if it is all "in image".
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        const int index = block_row * mis + block_col;
-        MODE_INFO *prev_mi = prev_mi_8x8[index];
-        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-        // Use previous partition if block size is not larger than bsize.
-        if (prev_mi && sb_type <= bsize) {
-          int block_row2, block_col2;
-          for (block_row2 = 0; block_row2 < bh; ++block_row2) {
-            for (block_col2 = 0; block_col2 < bw; ++block_col2) {
-              const int index2 = (block_row + block_row2) * mis +
-                  block_col + block_col2;
-              prev_mi = prev_mi_8x8[index2];
-              if (prev_mi) {
-                const ptrdiff_t offset = prev_mi - cm->prev_mi;
-                mi_8x8[index2] = cm->mi + offset;
-                mi_8x8[index2]->mbmi.sb_type = prev_mi->mbmi.sb_type;
-              }
-            }
-          }
-        } else {
-          // Otherwise, use fixed partition of size bsize.
-          mi_8x8[index] = mi_upper_left + index;
-          mi_8x8[index]->mbmi.sb_type = bsize;
-        }
-      }
-    }
-  } else {
-    // Else this is a partial SB64, copy previous partition.
-    copy_partitioning(cm, mi_8x8, prev_mi_8x8);
-  }
-}
-
-const struct {
+static const struct {
   int row;
   int col;
 } coord_lookup[16] = {
@@ -1149,10 +1577,10 @@
 
 static void set_source_var_based_partition(VP9_COMP *cpi,
                                            const TileInfo *const tile,
+                                           MACROBLOCK *const x,
                                            MODE_INFO **mi_8x8,
                                            int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
   const int mis = cm->mi_stride;
   const int row8x8_remaining = tile->mi_row_end - mi_row;
   const int col8x8_remaining = tile->mi_col_end - mi_col;
@@ -1173,7 +1601,7 @@
     int use32x32 = 0;
     unsigned int thr = cpi->source_var_thresh;
 
-    vpx_memset(d32, 0, 4 * sizeof(diff));
+    memset(d32, 0, 4 * sizeof(diff));
 
     for (i = 0; i < 4; i++) {
       diff *d16[4];
@@ -1233,88 +1661,66 @@
   }
 }
 
-static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
-                         int mi_row, int mi_col) {
-  MACROBLOCK *x = &cpi->mb;
-  uint8_t *src, *pre;
-  int src_stride, pre_stride;
-
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
-
-  int this_sad = 0;
-  int threshold = 0;
-
-  // This assumes the input source frames are of the same dimension.
-  src_stride = cpi->Source->y_stride;
-  src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
-            (mi_col * MI_SIZE);
-  pre_stride = cpi->Last_Source->y_stride;
-  pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
-          (mi_col * MI_SIZE);
-
-  if (row8x8_remaining >= MI_BLOCK_SIZE &&
-      col8x8_remaining >= MI_BLOCK_SIZE) {
-    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride);
-    threshold = (1 << 12);
-  } else {
-    int r, c;
-    for (r = 0; r < row8x8_remaining; r += 2)
-      for (c = 0; c < col8x8_remaining; c += 2)
-        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
-                                                 pre, pre_stride);
-    threshold = (row8x8_remaining * col8x8_remaining) << 6;
-  }
-
-  x->in_static_area = (this_sad < 2 * threshold);
-  return x->in_static_area;
-}
-
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
-                         const int motion_thresh) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  if (cm->prev_mi) {
-    for (block_row = 0; block_row < 8; ++block_row) {
-      for (block_col = 0; block_col < 8; ++block_col) {
-        const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col];
-        if (prev_mi) {
-          if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
-              abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
-            return 1;
-        }
-      }
-    }
-  }
-  return 0;
-}
-
-static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
+                            PICK_MODE_CONTEXT *ctx,
                             int mi_row, int mi_col, int bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
 
   *(xd->mi[0]) = ctx->mic;
+  *(x->mbmi_ext) = ctx->mbmi_ext;
 
-  // For in frame adaptive Q, check for reseting the segment_id and updating
-  // the cyclic refresh map.
-  if ((cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) && seg->enabled) {
-    vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
-                                      mi_row, mi_col, bsize, 1);
+  if (seg->enabled && cpi->oxcf.aq_mode) {
+    // For in frame complexity AQ or variance AQ, copy segment_id from
+    // segmentation_map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ ||
+        cpi->oxcf.aq_mode == VARIANCE_AQ ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    } else {
+    // Setting segmentation map for cyclic_refresh.
+      vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
+                                        ctx->rate, ctx->dist, x->skip);
+    }
     vp9_init_plane_quantizers(cpi, x);
   }
 
   if (is_inter_block(mbmi)) {
-    vp9_update_mv_count(cm, xd);
-
+    vp9_update_mv_count(td);
     if (cm->interp_filter == SWITCHABLE) {
       const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
-      ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
+      ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+    }
+
+    if (mbmi->sb_type < BLOCK_8X8) {
+      mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+      mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+    }
+  }
+
+  if (cm->use_prev_frame_mvs) {
+    MV_REF *const frame_mvs =
+        cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+    int w, h;
+
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+        mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+        mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+        mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+      }
     }
   }
 
@@ -1322,36 +1728,40 @@
   x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
 }
 
-static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
+                        const TileInfo *const tile,
                         TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE bsize,
-                     PICK_MODE_CONTEXT *ctx) {
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  update_state_rt(cpi, ctx, mi_row, mi_col, bsize);
+                        int output_enabled, BLOCK_SIZE bsize,
+                        PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCK *const x = &td->mb;
+  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+  update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) {
-    vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col,
+  if (cpi->oxcf.noise_sensitivity > 0 && output_enabled &&
+      cpi->common.frame_type != KEY_FRAME) {
+    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
                          MAX(BLOCK_8X8, bsize), ctx);
   }
 #endif
 
-  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
-  update_stats(&cpi->common, &cpi->mb);
+  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  update_stats(&cpi->common, td);
 
   (*tp)->token = EOSB_TOKEN;
   (*tp)++;
 }
 
-static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
+                         const TileInfo *const tile,
                          TOKENEXTRA **tp, int mi_row, int mi_col,
                          int output_enabled, BLOCK_SIZE bsize,
                          PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   int ctx;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -1371,42 +1781,42 @@
 
   partition = partition_lookup[bsl][subsize];
   if (output_enabled && bsize != BLOCK_4X4)
-    cm->counts.partition[ctx][partition]++;
+    td->counts->partition[ctx][partition]++;
 
   switch (partition) {
     case PARTITION_NONE:
-      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
                   &pc_tree->none);
       break;
     case PARTITION_VERT:
-      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
                   &pc_tree->vertical[0]);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+        encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
                     subsize, &pc_tree->vertical[1]);
       }
       break;
     case PARTITION_HORZ:
-      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
                   &pc_tree->horizontal[0]);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+        encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
                     subsize, &pc_tree->horizontal[1]);
       }
       break;
     case PARTITION_SPLIT:
       subsize = get_subsize(bsize, PARTITION_SPLIT);
-      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
                    pc_tree->split[0]);
-      encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+      encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
                    subsize, pc_tree->split[1]);
-      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+      encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
                    subsize, pc_tree->split[2]);
-      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                   subsize, pc_tree->split[3]);
+      encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
+                   output_enabled, subsize, pc_tree->split[3]);
       break;
     default:
-      assert("Invalid partition type.");
+      assert(0 && "Invalid partition type.");
       break;
   }
 
@@ -1415,16 +1825,19 @@
 }
 
 static void rd_use_partition(VP9_COMP *cpi,
-                             const TileInfo *const tile,
-                             MODE_INFO **mi_8x8,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                             ThreadData *td,
+                             TileDataEnc *tile_data,
+                             MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                             int mi_row, int mi_col,
+                             BLOCK_SIZE bsize,
+                             int *rate, int64_t *dist,
                              int do_recon, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = cm->mi_stride;
-  const int bsl = b_width_log2(bsize);
+  const int bsl = b_width_log2_lookup[bsize];
   const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
   const int bss = (1 << bsl) / 4;
   int i, pl;
@@ -1432,15 +1845,7 @@
   BLOCK_SIZE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
-  int last_part_rate = INT_MAX;
-  int64_t last_part_dist = INT64_MAX;
-  int64_t last_part_rd = INT64_MAX;
-  int none_rate = INT_MAX;
-  int64_t none_dist = INT64_MAX;
-  int64_t none_rd = INT64_MAX;
-  int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT64_MAX;
-  int64_t chosen_rd = INT64_MAX;
+  RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
@@ -1453,14 +1858,18 @@
   assert(num_4x4_blocks_wide_lookup[bsize] ==
          num_4x4_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_reset(&last_part_rdc);
+  vp9_rd_cost_reset(&none_rdc);
+  vp9_rd_cost_reset(&chosen_rdc);
+
   partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
   pc_tree->partitioning = partition;
-  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
-    set_offsets(cpi, tile, mi_row, mi_col, bsize);
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
 
@@ -1473,7 +1882,7 @@
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MODE_INFO * this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
         if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
           splits_below = 0;
         }
@@ -1486,17 +1895,18 @@
         mi_row + (mi_step >> 1) < cm->mi_rows &&
         mi_col + (mi_step >> 1) < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                       ctx, INT64_MAX, 0);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                       ctx, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
-      if (none_rate < INT_MAX) {
-        none_rate += cpi->partition_cost[pl][PARTITION_NONE];
-        none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
+      if (none_rdc.rate < INT_MAX) {
+        none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
+                                 none_rdc.dist);
       }
 
-      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
@@ -1504,84 +1914,84 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, bsize, ctx, INT64_MAX, 0);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, subsize, &pc_tree->horizontal[0],
-                       INT64_MAX, 0);
-      if (last_part_rate != INT_MAX &&
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->horizontal[0],
+                       INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
-        int rt = 0;
-        int64_t dt = 0;
+        RD_COST tmp_rdc;
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-        update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt,
-                         subsize, &pc_tree->horizontal[1], INT64_MAX, 1);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+        vp9_rd_cost_init(&tmp_rdc);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         subsize, &pc_tree->horizontal[1], INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
     case PARTITION_VERT:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                       &last_part_dist, subsize, &pc_tree->vertical[0],
-                       INT64_MAX, 0);
-      if (last_part_rate != INT_MAX &&
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->vertical[0], INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
-        int rt = 0;
-        int64_t dt = 0;
+        RD_COST tmp_rdc;
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
-        update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt,
+        vp9_rd_cost_init(&tmp_rdc);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
-                         INT64_MAX, 1);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+                         INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
-                         &last_part_dist, subsize, pc_tree->leaf_split[0],
-                         INT64_MAX, 0);
+        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                         subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
-      last_part_rate = 0;
-      last_part_dist = 0;
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
       for (i = 0; i < 4; i++) {
         int x_idx = (i & 1) * (mi_step >> 1);
         int y_idx = (i >> 1) * (mi_step >> 1);
         int jj = i >> 1, ii = i & 0x01;
-        int rt;
-        int64_t dt;
-
+        RD_COST tmp_rdc;
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
-        rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
-                         mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
+        vp9_rd_cost_init(&tmp_rdc);
+        rd_use_partition(cpi, td, tile_data,
+                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mi_row + y_idx, mi_col + x_idx, subsize,
+                         &tmp_rdc.rate, &tmp_rdc.dist,
                          i != 3, pc_tree->split[i]);
-        if (rt == INT_MAX || dt == INT64_MAX) {
-          last_part_rate = INT_MAX;
-          last_part_dist = INT64_MAX;
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
           break;
         }
-        last_part_rate += rt;
-        last_part_dist += dt;
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
       }
       break;
     default:
@@ -1590,9 +2000,10 @@
   }
 
   pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-  if (last_part_rate < INT_MAX) {
-    last_part_rate += cpi->partition_cost[pl][partition];
-    last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += cpi->partition_cost[pl][partition];
+    last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                  last_part_rdc.rate, last_part_rdc.dist);
   }
 
   if (do_partition_search
@@ -1604,99 +2015,83 @@
       && (mi_col + mi_step < cm->mi_cols ||
           mi_col + (mi_step >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
-    chosen_rate = 0;
-    chosen_dist = 0;
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
     pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
     for (i = 0; i < 4; i++) {
       int x_idx = (i & 1) * (mi_step >> 1);
       int y_idx = (i >> 1) * (mi_step >> 1);
-      int rt = 0;
-      int64_t dt = 0;
+      RD_COST tmp_rdc;
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl[8], sa[8];
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                       split_subsize, &pc_tree->split[i]->none,
-                       INT64_MAX, i);
+      rd_pick_sb_modes(cpi, tile_data, x,
+                       mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
-      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      if (rt == INT_MAX || dt == INT64_MAX) {
-        chosen_rate = INT_MAX;
-        chosen_dist = INT64_MAX;
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        vp9_rd_cost_reset(&chosen_rdc);
         break;
       }
 
-      chosen_rate += rt;
-      chosen_dist += dt;
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
 
       if (i != 3)
-        encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
+        encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize, pc_tree->split[i]);
 
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
-      chosen_rate += cpi->partition_cost[pl][PARTITION_NONE];
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
     }
     pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-    if (chosen_rate < INT_MAX) {
-      chosen_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 chosen_rdc.rate, chosen_rdc.dist);
     }
   }
 
   // If last_part is better set the partitioning to that.
-  if (last_part_rd < chosen_rd) {
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
     mi_8x8[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
-    chosen_rate = last_part_rate;
-    chosen_dist = last_part_dist;
-    chosen_rd = last_part_rd;
+    chosen_rdc = last_part_rdc;
   }
   // If none was better set the partitioning to that.
-  if (none_rd < chosen_rd) {
+  if (none_rdc.rdcost < chosen_rdc.rdcost) {
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = PARTITION_NONE;
-    chosen_rate = none_rate;
-    chosen_dist = none_dist;
+    chosen_rdc = none_rdc;
   }
 
-  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if ( bsize == BLOCK_64X64)
-    assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX);
+  if (bsize == BLOCK_64X64)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
-
-    // Check the projected output rate for this SB against it's target
-    // and and if necessary apply a Q delta using segmentation to get
-    // closer to the target.
-    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
-                                    output_enabled, chosen_rate);
-    }
-
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              chosen_rate, chosen_dist);
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
   }
 
-  *rate = chosen_rate;
-  *dist = chosen_dist;
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
 }
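
rd_use_partition() evaluates up to three candidates as RD_COST values: the partitioning inherited from mi_8x8, a forced PARTITION_NONE, and a re-searched PARTITION_SPLIT, and keeps whichever has the lowest rdcost; a candidate that could not be evaluated stays at the INT64_MAX sentinel so it never wins. A compact sketch of that selection with a simplified RD_COST-like struct:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { int rate; int64_t dist; int64_t rdcost; } RD_COST_SKETCH;

typedef enum { PART_INHERITED, PART_NONE, PART_SPLIT } PART_CHOICE;

/* Pick the cheapest of the three candidates; INT64_MAX marks "not tried"
 * or "failed", so it never wins (mirrors the chosen/last_part/none logic). */
static PART_CHOICE pick(const RD_COST_SKETCH *last_part,
                        const RD_COST_SKETCH *none,
                        const RD_COST_SKETCH *split) {
  PART_CHOICE best = PART_SPLIT;
  int64_t best_rd = split->rdcost;
  if (last_part->rdcost < best_rd) {
    best = PART_INHERITED;
    best_rd = last_part->rdcost;
  }
  if (none->rdcost < best_rd)
    best = PART_NONE;
  return best;
}

int main(void) {
  RD_COST_SKETCH last_part = {100, 4000, 11000};
  RD_COST_SKETCH none = {INT_MAX, INT64_MAX, INT64_MAX};  /* not evaluated */
  RD_COST_SKETCH split = {140, 3000, 12800};
  printf("choice=%d\n", (int)pick(&last_part, &none, &split));
  return 0;
}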
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
@@ -1715,6 +2110,7 @@
   BLOCK_64X64
 };
 
+
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
 // At the moment this is designed to work on a 64x64 SB but could be
@@ -1724,7 +2120,8 @@
 // function so repeat calls can accumulate a min and max of more than one sb64.
 static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
                                         BLOCK_SIZE *min_block_size,
-                                        BLOCK_SIZE *max_block_size ) {
+                                        BLOCK_SIZE *max_block_size,
+                                        int bs_hist[BLOCK_SIZES]) {
   int sb_width_in_blocks = MI_BLOCK_SIZE;
   int sb_height_in_blocks  = MI_BLOCK_SIZE;
   int i, j;
@@ -1733,8 +2130,9 @@
   // Check the sb_type for each block that belongs to this region.
   for (i = 0; i < sb_height_in_blocks; ++i) {
     for (j = 0; j < sb_width_in_blocks; ++j) {
-      MODE_INFO * mi = mi_8x8[index+j];
+      MODE_INFO *mi = mi_8x8[index+j];
       BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
+      bs_hist[sb_type]++;
       *min_block_size = MIN(*min_block_size, sb_type);
       *max_block_size = MAX(*max_block_size, sb_type);
     }
@@ -1754,11 +2152,11 @@
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
 static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+                                    MACROBLOCKD *const xd,
                                     int mi_row, int mi_col,
                                     BLOCK_SIZE *min_block_size,
                                     BLOCK_SIZE *max_block_size) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MODE_INFO **mi = xd->mi;
   const int left_in_image = xd->left_available && mi[-1];
   const int above_in_image = xd->up_available && mi[-xd->mi_stride];
@@ -1767,6 +2165,8 @@
   int bh, bw;
   BLOCK_SIZE min_size = BLOCK_4X4;
   BLOCK_SIZE max_size = BLOCK_64X64;
+  int bs_hist[BLOCK_SIZES] = {0};
+
   // Trap case where we do not have a prediction.
   if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
     // Default "min to max" and "max to min"
@@ -1777,21 +2177,24 @@
     // passed in values for min and max as a starting point.
     // Find the min and max partition used in previous frame at this location
     if (cm->frame_type != KEY_FRAME) {
-      MODE_INFO **const prev_mi =
+      MODE_INFO **prev_mi =
           &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
-      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
     }
     // Find the min and max partition sizes used in the left SB64
     if (left_in_image) {
       MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
     }
     // Find the min and max partition sizes used in the above SB64.
     if (above_in_image) {
       MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
     }
-    // adjust observed min and max
+
+    // Adjust observed min and max for "relaxed" auto partition case.
     if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
       min_size = min_partition_size[min_size];
       max_size = max_partition_size[max_size];
@@ -1802,7 +2205,15 @@
   max_size = find_partition_size(max_size,
                                  row8x8_remaining, col8x8_remaining,
                                  &bh, &bw);
-  min_size = MIN(min_size, max_size);
+  // Test for blocks at the edge of the active image.
+  // This may be the actual edge of the image or where there are formatting
+  // bars.
+  if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
+    min_size = BLOCK_4X4;
+  } else {
+    min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
+                   MIN(min_size, max_size));
+  }
 
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
@@ -1811,66 +2222,6 @@
       next_square_size[max_size] < min_size) {
      min_size = next_square_size[max_size];
   }
-  *min_block_size = min_size;
-  *max_block_size = max_size;
-}
-
-static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
-                                 int mi_row, int mi_col,
-                                 BLOCK_SIZE *min_block_size,
-                                 BLOCK_SIZE *max_block_size) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi;
-  const int left_in_image = xd->left_available && mi_8x8[-1];
-  const int above_in_image = xd->up_available &&
-                             mi_8x8[-xd->mi_stride];
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
-  int bh, bw;
-  BLOCK_SIZE min_size = BLOCK_32X32;
-  BLOCK_SIZE max_size = BLOCK_8X8;
-  int bsl = mi_width_log2(BLOCK_64X64);
-  const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
-                       get_chessboard_index(cm->current_video_frame)) & 0x1;
-  // Trap case where we do not have a prediction.
-  if (search_range_ctrl &&
-      (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
-    int block;
-    MODE_INFO **mi;
-    BLOCK_SIZE sb_type;
-
-    // Find the min and max partition sizes used in the left SB64.
-    if (left_in_image) {
-      MODE_INFO *cur_mi;
-      mi = &mi_8x8[-1];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        cur_mi = mi[block * xd->mi_stride];
-        sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-    // Find the min and max partition sizes used in the above SB64.
-    if (above_in_image) {
-      mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-
-    min_size = min_partition_size[min_size];
-    max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
-                                   &bh, &bw);
-    min_size = MIN(min_size, max_size);
-    min_size = MAX(min_size, BLOCK_8X8);
-    max_size = MIN(max_size, BLOCK_32X32);
-  } else {
-    min_size = BLOCK_8X8;
-    max_size = BLOCK_32X32;
-  }
 
   *min_block_size = min_size;
   *max_block_size = max_size;
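
rd_auto_partition_range() now clamps the observed minimum with sf.rd_auto_partition_min_limit and drops the floor to BLOCK_4X4 when the superblock lies on an active image edge (the real frame edge or formatting bars), per the comment above. A small sketch of that clamp; the at_active_edge flag is a hypothetical stand-in for vp9_active_edge_sb():

#include <stdio.h>

/* Hypothetical size ordering, smallest to largest, mirroring BLOCK_4X4
 * .. BLOCK_64X64 in the real BLOCK_SIZE enum. */
enum { B4X4, B8X8, B16X16, B32X32, B64X64 };

static int clamp_min_size(int observed_min, int observed_max,
                          int rd_auto_partition_min_limit,
                          int at_active_edge) {
  if (at_active_edge)
    return B4X4;  /* let an edge superblock split all the way down */
  /* otherwise: MIN(sf limit, MIN(observed min, observed max)) */
  {
    const int m = observed_min < observed_max ? observed_min : observed_max;
    return rd_auto_partition_min_limit < m ? rd_auto_partition_min_limit : m;
  }
}

int main(void) {
  printf("%d\n", clamp_min_size(B32X32, B64X64, B8X8, 0));  /* -> B8X8 */
  printf("%d\n", clamp_min_size(B32X32, B64X64, B8X8, 1));  /* -> B4X4 */
  return 0;
}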
@@ -1885,8 +2236,8 @@
   int idx, idy;
 
   MODE_INFO *mi;
-  MODE_INFO **prev_mi =
-      &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+  const int idx_str = cm->mi_stride * mi_row + mi_col;
+  MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str];
   BLOCK_SIZE bs, min_size, max_size;
 
   min_size = BLOCK_64X64;
@@ -1931,11 +2282,11 @@
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+  memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
 }
 
 static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+  memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
 #if CONFIG_FP_MB_STATS
@@ -1986,13 +2337,14 @@
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
-                              TOKENEXTRA **tp, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int *rate,
-                              int64_t *dist, int64_t best_rd,
-                              PC_TREE *pc_tree) {
+static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data,
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, RD_COST *rd_cost,
+                              int64_t best_rd, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2001,9 +2353,7 @@
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i, pl;
   BLOCK_SIZE subsize;
-  int this_rate, sum_rate = 0, best_rate = INT_MAX;
-  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
-  int64_t sum_rd = 0;
+  RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
 
@@ -2013,8 +2363,8 @@
   const int xss = x->e_mbd.plane[1].subsampling_x;
   const int yss = x->e_mbd.plane[1].subsampling_y;
 
-  BLOCK_SIZE min_size = cpi->sf.min_partition_size;
-  BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+  BLOCK_SIZE min_size = x->min_partition_size;
+  BLOCK_SIZE max_size = x->max_partition_size;
 
 #if CONFIG_FP_MB_STATS
   unsigned int src_diff_var = UINT_MAX;
@@ -2031,7 +2381,12 @@
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  vp9_rd_cost_init(&this_rdc);
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
@@ -2059,23 +2414,12 @@
     partition_vert_allowed &= force_vert_split;
   }
 
-  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
-
-  if (cpi->sf.disable_split_var_thresh && partition_none_allowed) {
-    unsigned int source_variancey;
-    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
-    source_variancey = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-    if (source_variancey < cpi->sf.disable_split_var_thresh) {
-      do_split = 0;
-      if (source_variancey < cpi->sf.disable_split_var_thresh / 2)
-        do_rect = 0;
-    }
-  }
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile, mi_row, mi_col, bsize);
-    src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
                                                   mi_row, mi_col, bsize);
   }
 #endif
@@ -2133,33 +2477,38 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                     ctx, best_rd, 0);
-    if (this_rate != INT_MAX) {
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+                     &this_rdc, bsize, ctx, best_rdc.rdcost);
+    if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 this_rdc.rate, this_rdc.dist);
       }
-      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
 
-      if (sum_rd < best_rd) {
-        int64_t stop_thresh = 4096;
-        int64_t stop_thresh_rd;
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+        int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
 
-        best_rate = this_rate;
-        best_dist = this_dist;
-        best_rd = sum_rd;
+        best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust threshold according to partition size.
-        stop_thresh >>= 8 - (b_width_log2(bsize) +
-            b_height_log2(bsize));
+        // Adjust dist breakout threshold according to the partition size.
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
 
-        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
-        // If obtained distortion is very small, choose current partition
-        // and stop splitting.
-        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+        // If all y, u, v transform blocks in this partition are skippable, and
+        // the dist & rate are within the thresholds, the partition search is
+        // terminated for current branch of the partition search tree.
+        // The dist & rate thresholds are set to 0 at speed 0 to disable the
+        // early termination at that speed.
+        if (!x->e_mbd.lossless &&
+            (ctx->skippable && best_rdc.dist < dist_breakout_thr &&
+            best_rdc.rate < rate_breakout_thr)) {
           do_split = 0;
           do_rect = 0;
         }
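
The dist/rate breakout thresholds used above come from the speed features as constants defined for the largest block size and are rescaled for the block actually being searched: the distortion threshold shrinks in proportion to the block area (the `>>= 8 - (bw_log2 + bh_log2)` shift), while the rate threshold grows with the log2 pixel count. A minimal standalone sketch of that scaling, with hypothetical threshold values and simplified lookups rather than the real vp9 tables:

  /* Sketch of the partition-search breakout scaling. The threshold values
   * and the two derived quantities are illustrative assumptions; the real
   * encoder reads b_width_log2_lookup[], b_height_log2_lookup[] and
   * num_pels_log2_lookup[] for the given BLOCK_SIZE. */
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int64_t dist_thr_64x64 = 1 << 20;  /* hypothetical sf threshold */
    const int rate_thr_per_pel_log2 = 100;   /* hypothetical sf threshold */
    int sz_log2;
    for (sz_log2 = 3; sz_log2 <= 6; ++sz_log2) {   /* 8x8 .. 64x64 */
      const int b_wh_log2 = sz_log2 - 2;           /* block size in 4x4 units */
      const int shift = 8 - 2 * b_wh_log2;         /* 0 for 64x64, 6 for 8x8 */
      const int num_pels_log2 = 2 * sz_log2;       /* 12 for 64x64, 6 for 8x8 */
      printf("%2dx%-2d: dist_thr=%lld rate_thr=%d\n",
             1 << sz_log2, 1 << sz_log2,
             (long long)(dist_thr_64x64 >> shift),
             rate_thr_per_pel_log2 * num_pels_log2);
    }
    return 0;
  }

With these numbers an 8x8 block needs a distortion 64x smaller than a 64x64 block before the early exit fires, and, as the comment above notes, speed 0 sets both thresholds to 0 so the breakout never triggers there.
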
@@ -2198,9 +2547,9 @@
           }
           if (skip) {
             if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile, mi_row, mi_col, bsize);
+              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
               src_diff_var = get_sby_perpixel_diff_variance(
-                  cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize);
+                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
             }
             if (src_diff_var < 8) {
               do_split = 0;
@@ -2211,7 +2560,7 @@
 #endif
       }
     }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
   // store estimated motion vector
@@ -2219,7 +2568,6 @@
     store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
-  sum_rd = 0;
   // TODO(jingning): use the motion vectors given by the above search as
   // the starting point of motion search in the following partition type check.
   if (do_split) {
@@ -2229,14 +2577,12 @@
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                       pc_tree->leaf_split[0], best_rd, 0);
-      if (sum_rate == INT_MAX)
-        sum_rd = INT64_MAX;
-      else
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                       pc_tree->leaf_split[0], best_rdc.rdcost);
+      if (sum_rdc.rate == INT_MAX)
+        sum_rdc.rdcost = INT64_MAX;
     } else {
-      for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+      for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
       const int x_idx = (i & 1) * mi_step;
       const int y_idx = (i >> 1) * mi_step;
 
@@ -2247,29 +2593,30 @@
           load_pred_mv(x, ctx);
 
         pc_tree->split[i]->index = i;
-        rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
-                          subsize, &this_rate, &this_dist,
-                          best_rd - sum_rd, pc_tree->split[i]);
+        rd_pick_partition(cpi, td, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rdc,
+                          best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
-        if (this_rate == INT_MAX) {
-          sum_rd = INT64_MAX;
+        if (this_rdc.rate == INT_MAX) {
+          sum_rdc.rdcost = INT64_MAX;
+          break;
         } else {
-          sum_rate += this_rate;
-          sum_dist += this_dist;
-          sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+          sum_rdc.rate += this_rdc.rate;
+          sum_rdc.dist += this_rdc.dist;
+          sum_rdc.rdcost += this_rdc.rdcost;
         }
       }
     }
 
-    if (sum_rd < best_rd && i == 4) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
 
-      if (sum_rd < best_rd) {
-        best_rate = sum_rate;
-        best_dist = sum_dist;
-        best_rd = sum_rd;
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else {
@@ -2278,26 +2625,27 @@
       if (cpi->sf.less_rectangular_check)
         do_rect &= !partition_none_allowed;
     }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
   // PARTITION_HORZ
-  if (partition_horz_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_HORZ);
+  if (partition_horz_allowed &&
+      (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                     &pc_tree->horizontal[0], best_rd, 0);
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->horizontal[0], best_rdc.rdcost);
 
-    if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+        bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-      update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
+      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
 
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, ctx);
@@ -2305,33 +2653,33 @@
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rate,
-                       &this_dist, subsize, &pc_tree->horizontal[1],
-                       best_rd - sum_rd, 1);
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+                       &this_rdc, subsize, &pc_tree->horizontal[1],
+                       best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
-    if (sum_rd < best_rd) {
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_HORZ];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-      if (sum_rd < best_rd) {
-        best_rd = sum_rd;
-        best_rate = sum_rate;
-        best_dist = sum_dist;
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
   // PARTITION_VERT
-  if (partition_vert_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_VERT);
+  if (partition_vert_allowed &&
+      (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
@@ -2339,12 +2687,12 @@
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                     &pc_tree->vertical[0], best_rd, 0);
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) {
-      update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->vertical[0], best_rdc.rdcost);
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+        bsize > BLOCK_8X8) {
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
                         &pc_tree->vertical[0]);
 
       if (cpi->sf.adaptive_motion_search)
@@ -2353,30 +2701,29 @@
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate,
-                       &this_dist, subsize,
-                       &pc_tree->vertical[1], best_rd - sum_rd,
-                       1);
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
+                       &this_rdc, subsize,
+                       &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
-    if (sum_rd < best_rd) {
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rate += cpi->partition_cost[pl][PARTITION_VERT];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-      if (sum_rd < best_rd) {
-        best_rate = sum_rate;
-        best_dist = sum_dist;
-        best_rd = sum_rd;
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
-    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
   // TODO(jbb): This code added so that we avoid static analysis
@@ -2384,149 +2731,112 @@
   // point.  This code should be refactored so that the duplicate
   // checks occur in some sub function and thus are used...
   (void) best_rd;
-  *rate = best_rate;
-  *dist = best_dist;
+  *rd_cost = best_rdc;
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) {
+
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      pc_tree->index != 3) {
     int output_enabled = (bsize == BLOCK_64X64);
-
-    // Check the projected output rate for this SB against it's target
-    // and and if necessary apply a Q delta using segmentation to get
-    // closer to the target.
-    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
-                                    best_rate);
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              best_rate, best_dist);
-
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+              bsize, pc_tree);
   }
 
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
-    assert(best_rate < INT_MAX);
-    assert(best_dist < INT64_MAX);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
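
Throughout this function the running totals are kept as RD_COST triples (rate, dist, rdcost): sums start from vp9_rd_cost_init(), the best candidate starts from vp9_rd_cost_reset() clamped to the caller's best_rd, and each sub-search either accumulates into the sum or poisons it with INT64_MAX when a mode search fails. A minimal sketch of that pattern; the struct layout, the RDCOST-style formula and the helper bodies below are assumptions modelled on how the values are used here, not the exact vp9_rd.h definitions:

  #include <limits.h>
  #include <stdint.h>

  typedef struct { int rate; int64_t dist; int64_t rdcost; } rd_cost_sketch;

  /* Assumed shape of the rate-distortion cost combination. */
  #define RDCOST_SKETCH(rdmult, rddiv, r, d) \
    ((((int64_t)(r) * (rdmult) + 128) >> 8) + ((int64_t)(d) << (rddiv)))

  static void rd_cost_init_sketch(rd_cost_sketch *rd) {
    rd->rate = 0; rd->dist = 0; rd->rdcost = 0;
  }
  static void rd_cost_reset_sketch(rd_cost_sketch *rd) {
    rd->rate = INT_MAX; rd->dist = INT64_MAX; rd->rdcost = INT64_MAX;
  }
  /* Fold one sub-block result into a running sum, as the SPLIT loop does:
   * a failed mode search (rate == INT_MAX) invalidates the whole sum. */
  static void rd_cost_accum_sketch(rd_cost_sketch *sum, const rd_cost_sketch *p) {
    if (p->rate == INT_MAX) { sum->rdcost = INT64_MAX; return; }
    sum->rate += p->rate;
    sum->dist += p->dist;
    sum->rdcost += p->rdcost;
  }

A candidate partition then replaces best_rdc only when its recomputed rdcost (including the partition signalling cost) is strictly smaller, which keeps the comparisons equivalent to separate rate/dist/rd locals but with one struct copy instead of three assignments.
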
 
-static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi,
+                             ThreadData *td,
+                             TileDataEnc *tile_data,
+                             int mi_row,
+                             TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
   // Initialize the left context for the new SB row
-  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
-  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+  memset(&xd->left_context, 0, sizeof(xd->left_context));
+  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
+    const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
-
+    RD_COST dummy_rdc;
     int i;
+    int seg_skip = 0;
+
+    const int idx_str = cm->mi_stride * mi_row + mi_col;
+    MODE_INFO **mi = cm->mi_grid_visible + idx_str;
 
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < 64; ++i)
-        cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+        td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
 
       for (i = 0; i < 64; ++i) {
-        cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
-        cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
-        cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
-        cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
       }
     }
 
-    vp9_zero(cpi->mb.pred_mv);
-    cpi->pc_root->index = 0;
+    vp9_zero(x->pred_mv);
+    td->pc_root->index = 0;
 
-    if ((sf->partition_search_type == SEARCH_PARTITION &&
-         sf->use_lastframe_partitioning) ||
-         sf->partition_search_type == FIXED_PARTITION ||
-         sf->partition_search_type == VAR_BASED_PARTITION ||
-         sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
-      const int idx_str = cm->mi_stride * mi_row + mi_col;
-      MODE_INFO **mi = cm->mi_grid_visible + idx_str;
-      MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
-      cpi->mb.source_variance = UINT_MAX;
-      if (sf->partition_search_type == FIXED_PARTITION) {
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
-                               sf->always_this_block_size);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (cpi->skippable_frame ||
-                 sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
-        BLOCK_SIZE bsize;
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
-        choose_partitioning(cpi, tile, mi_row, mi_col);
-        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else {
-        GF_GROUP * gf_grp = &cpi->twopass.gf_group;
-        int last_was_mid_sequence_overlay = 0;
-        if ((cpi->oxcf.pass == 2) && (gf_grp->index)) {
-          if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
-            last_was_mid_sequence_overlay = 1;
-        }
-        if ((cpi->rc.frames_since_key
-            % sf->last_partitioning_redo_frequency) == 0
-            || last_was_mid_sequence_overlay
-            || cm->prev_mi == 0
-            || cm->show_frame == 0
-            || cm->frame_type == KEY_FRAME
-            || cpi->rc.is_src_frame_alt_ref
-            || ((sf->use_lastframe_partitioning ==
-                 LAST_FRAME_PARTITION_LOW_MOTION) &&
-                 sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
-          // If required set upper and lower partition size limits
-          if (sf->auto_min_max_partition_size) {
-            set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-            rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                    &sf->min_partition_size,
-                                    &sf->max_partition_size);
-          }
-          rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                            &dummy_rate, &dummy_dist, INT64_MAX,
-                            cpi->pc_root);
-        } else {
-          if (sf->constrain_copy_partition &&
-              sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
-            constrain_copy_partitioning(cpi, tile, mi, prev_mi,
-                                        mi_row, mi_col, BLOCK_16X16);
-          else
-            copy_partitioning(cm, mi, prev_mi);
-          rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                           &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-        }
-      }
+    if (seg->enabled) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+    }
+
+    x->source_variance = UINT_MAX;
+    if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+      const BLOCK_SIZE bsize =
+          seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (cpi->partition_search_skippable_frame) {
+      BLOCK_SIZE bsize;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+               cm->frame_type != KEY_FRAME) {
+      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                &sf->min_partition_size,
-                                &sf->max_partition_size);
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+        rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+                                &x->min_partition_size,
+                                &x->max_partition_size);
       }
-      rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root);
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+                        &dummy_rdc, INT64_MAX, td->pc_root);
     }
   }
 }
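
Before choosing a partition strategy, encode_rd_sb_row checks whether the active segment for the 64x64 block has SEG_LVL_SKIP set; if it does, the full partition search is bypassed and the block is coded with a fixed 64x64 partitioning. A compact sketch of that decision, with the segmentation state reduced to plain fields (assumed shapes, not the real vp9 structs; the real get_segment_id() also considers every mi unit in the block rather than just the top-left one):

  #include <stdint.h>

  #define SKETCH_MAX_SEGMENTS 8

  struct seg_sketch {
    int enabled;
    int update_map;                            /* map refreshed this frame? */
    uint8_t skip_active[SKETCH_MAX_SEGMENTS];  /* SEG_LVL_SKIP per segment */
  };

  /* Returns 1 when the SB at (mi_row, mi_col) may skip the RD partition
   * search and be coded as a single fixed 64x64 block. */
  static int sb_seg_skip(const struct seg_sketch *seg,
                         const uint8_t *enc_map, const uint8_t *last_map,
                         int mi_stride, int mi_row, int mi_col) {
    const uint8_t *map;
    int segment_id;
    if (!seg->enabled)
      return 0;
    map = seg->update_map ? enc_map : last_map;  /* same choice as above */
    segment_id = map[mi_row * mi_stride + mi_col];
    return seg->skip_active[segment_id];
  }
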
 
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
@@ -2538,17 +2848,17 @@
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(xd->above_context[0], 0,
-             sizeof(*xd->above_context[0]) *
-             2 * aligned_mi_cols * MAX_MB_PLANE);
-  vpx_memset(xd->above_seg_context, 0,
-             sizeof(*xd->above_seg_context) * aligned_mi_cols);
+  memset(xd->above_context[0], 0,
+         sizeof(*xd->above_context[0]) *
+         2 * aligned_mi_cols * MAX_MB_PLANE);
+  memset(xd->above_seg_context, 0,
+         sizeof(*xd->above_seg_context) * aligned_mi_cols);
 }
 
 static int check_dual_ref_flags(VP9_COMP *cpi) {
   const int ref_flags = cpi->ref_frame_flags;
 
-  if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
     return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
@@ -2580,11 +2890,12 @@
     return LAST_FRAME;
 }
 
-static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
-  if (cpi->mb.e_mbd.lossless)
+static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
+  if (xd->lossless)
     return ONLY_4X4;
-  if (cpi->common.frame_type == KEY_FRAME)
-    return TX_MODE_SELECT;
+  if (cpi->common.frame_type == KEY_FRAME &&
+      cpi->sf.use_nonrd_pick_mode)
+    return ALLOW_16X16;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
   else if (cpi->sf.tx_size_search_method == USE_FULL_RD||
@@ -2594,37 +2905,59 @@
     return cpi->common.tx_mode;
 }
 
-static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                                int mi_row, int mi_col,
-                                int *rate, int64_t *dist,
+static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
+                                     RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                     PICK_MODE_CONTEXT *ctx) {
+  if (bsize < BLOCK_16X16)
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  else
+    vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+static void nonrd_pick_sb_modes(VP9_COMP *cpi,
+                                TileDataEnc *tile_data, MACROBLOCK *const x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
                                 BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
 
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
-    if (mbmi->segment_id && x->in_static_area)
+    if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
 
-  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+  if (cm->frame_type == KEY_FRAME)
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+  else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+  else if (bsize >= BLOCK_8X8)
+    vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
+                        rd_cost, bsize, ctx);
   else
-    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx);
+    vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col,
+                               rd_cost, bsize, ctx);
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+  if (rd_cost->rate == INT_MAX)
+    vp9_rd_cost_reset(rd_cost);
+
+  ctx->rate = rd_cost->rate;
+  ctx->dist = rd_cost->dist;
 }
 
 static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
                               int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, BLOCK_SIZE subsize,
+                              BLOCK_SIZE bsize,
                               PC_TREE *pc_tree) {
   MACROBLOCKD *xd = &x->e_mbd;
-  int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_subsize(bsize, partition);
 
   assert(bsize >= BLOCK_8X8);
 
@@ -2633,41 +2966,44 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
       *(xd->mi[0]) = pc_tree->none.mic;
+      *(x->mbmi_ext) = pc_tree->none.mbmi_ext;
       duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
       break;
     case PARTITION_VERT:
-      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
       *(xd->mi[0]) = pc_tree->vertical[0].mic;
-      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      *(x->mbmi_ext) = pc_tree->vertical[0].mbmi_ext;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
 
       if (mi_col + hbs < cm->mi_cols) {
-        set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
+        set_mode_info_offsets(cm, x, xd, mi_row, mi_col + hbs);
         *(xd->mi[0]) = pc_tree->vertical[1].mic;
-        duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize);
+        *(x->mbmi_ext) = pc_tree->vertical[1].mbmi_ext;
+        duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
       }
       break;
     case PARTITION_HORZ:
-      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
       *(xd->mi[0]) = pc_tree->horizontal[0].mic;
-      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      *(x->mbmi_ext) = pc_tree->horizontal[0].mbmi_ext;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
       if (mi_row + hbs < cm->mi_rows) {
-        set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
+        set_mode_info_offsets(cm, x, xd, mi_row + hbs, mi_col);
         *(xd->mi[0]) = pc_tree->horizontal[1].mic;
-        duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize);
+        *(x->mbmi_ext) = pc_tree->horizontal[1].mbmi_ext;
+        duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
       }
       break;
     case PARTITION_SPLIT: {
-      BLOCK_SIZE subsubsize = get_subsize(subsize, PARTITION_SPLIT);
-      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize,
-                        subsubsize, pc_tree->split[0]);
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
       fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
-                        subsubsize, pc_tree->split[1]);
+                        pc_tree->split[1]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
-                        subsubsize, pc_tree->split[2]);
+                        pc_tree->split[2]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
-                        subsubsize, pc_tree->split[3]);
+                        pc_tree->split[3]);
       break;
     }
     default:
@@ -2675,24 +3011,39 @@
   }
 }
 
-static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+// Reset the prediction pixel ready flag recursively.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->none.pred_pixel_ready = 0;
+  pc_tree->horizontal[0].pred_pixel_ready = 0;
+  pc_tree->horizontal[1].pred_pixel_ready = 0;
+  pc_tree->vertical[0].pred_pixel_ready = 0;
+  pc_tree->vertical[1].pred_pixel_ready = 0;
+
+  if (bsize > BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    int i;
+    for (i = 0; i < 4; ++i)
+      pred_pixel_ready_reset(pc_tree->split[i], subsize);
+  }
+}
+
+static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+                                 TileDataEnc *tile_data,
                                  TOKENEXTRA **tp, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize, int *rate,
-                                 int64_t *dist, int do_recon, int64_t best_rd,
+                                 int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+                                 int do_recon, int64_t best_rd,
                                  PC_TREE *pc_tree) {
   const SPEED_FEATURES *const sf = &cpi->sf;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i;
   BLOCK_SIZE subsize = bsize;
-  int this_rate, sum_rate = 0, best_rate = INT_MAX;
-  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
-  int64_t sum_rd = 0;
+  RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
@@ -2711,54 +3062,62 @@
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (sf->auto_min_max_partition_size) {
-    partition_none_allowed &= (bsize <= sf->max_partition_size &&
-                               bsize >= sf->min_partition_size);
-    partition_horz_allowed &= ((bsize <= sf->max_partition_size &&
-                                bsize > sf->min_partition_size) ||
+    partition_none_allowed &= (bsize <= x->max_partition_size &&
+                               bsize >= x->min_partition_size);
+    partition_horz_allowed &= ((bsize <= x->max_partition_size &&
+                                bsize > x->min_partition_size) ||
                                 force_horz_split);
-    partition_vert_allowed &= ((bsize <= sf->max_partition_size &&
-                                bsize > sf->min_partition_size) ||
+    partition_vert_allowed &= ((bsize <= x->max_partition_size &&
+                                bsize > x->min_partition_size) ||
                                 force_vert_split);
-    do_split &= bsize > sf->min_partition_size;
+    do_split &= bsize > x->min_partition_size;
   }
   if (sf->use_square_partition_only) {
     partition_horz_allowed &= force_horz_split;
     partition_vert_allowed &= force_vert_split;
   }
 
+  ctx->pred_pixel_ready = !(partition_vert_allowed ||
+                            partition_horz_allowed ||
+                            do_split);
+
   // PARTITION_NONE
   if (partition_none_allowed) {
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, bsize, ctx);
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+                        &this_rdc, bsize, ctx);
     ctx->mic.mbmi = xd->mi[0]->mbmi;
+    ctx->mbmi_ext = *x->mbmi_ext;
     ctx->skip_txfm[0] = x->skip_txfm[0];
     ctx->skip = x->skip;
 
-    if (this_rate != INT_MAX) {
+    if (this_rdc.rate != INT_MAX) {
       int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      this_rate += cpi->partition_cost[pl][PARTITION_NONE];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
-      if (sum_rd < best_rd) {
-        int64_t stop_thresh = 4096;
-        int64_t stop_thresh_rd;
+      this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+        int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
 
-        best_rate = this_rate;
-        best_dist = this_dist;
-        best_rd = sum_rd;
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust threshold according to partition size.
-        stop_thresh >>= 8 - (b_width_log2(bsize) +
-            b_height_log2(bsize));
-
-        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
-        // If obtained distortion is very small, choose current partition
-        // and stop splitting.
-        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+        if (!x->e_mbd.lossless &&
+            this_rdc.rate < rate_breakout_thr &&
+            this_rdc.dist < dist_breakout_thr) {
           do_split = 0;
           do_rect = 0;
         }
@@ -2770,35 +3129,34 @@
   store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
-  sum_rd = 0;
   if (do_split) {
     int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-    sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
     subsize = get_subsize(bsize, PARTITION_SPLIT);
-    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+    for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
       const int x_idx = (i & 1) * ms;
       const int y_idx = (i >> 1) * ms;
 
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
       load_pred_mv(x, ctx);
-      nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
-                           subsize, &this_rate, &this_dist, 0,
-                           best_rd - sum_rd, pc_tree->split[i]);
+      nonrd_pick_partition(cpi, td, tile_data, tp,
+                           mi_row + y_idx, mi_col + x_idx,
+                           subsize, &this_rdc, 0,
+                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
 
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_SPLIT;
     } else {
       // skip rectangular partition test when larger block size
@@ -2813,144 +3171,271 @@
     subsize = get_subsize(bsize, PARTITION_HORZ);
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
-
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    pc_tree->horizontal[0].pred_pixel_ready = 1;
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
 
     pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+    pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
     pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->horizontal[0].skip = x->skip;
 
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-
-    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
-                          &this_rate, &this_dist, subsize,
+      pc_tree->horizontal[1].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col,
+                          &this_rdc, subsize,
                           &pc_tree->horizontal[1]);
 
       pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
       pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[1].skip = x->skip;
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rd = sum_rd;
-      best_rate = sum_rate;
-      best_dist = sum_dist;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_HORZ;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
     }
   }
 
   // PARTITION_VERT
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
-
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
-
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    pc_tree->vertical[0].pred_pixel_ready = 1;
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
     pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+    pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
     pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->vertical[0].skip = x->skip;
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
-                          &this_rate, &this_dist, subsize,
+      pc_tree->vertical[1].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms,
+                          &this_rdc, subsize,
                           &pc_tree->vertical[1]);
       pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
       pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[1].skip = x->skip;
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_VERT];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_VERT;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
     }
   }
-  // TODO(JBB): The following line is here just to avoid a static warning
-  // that occurs because at this point we never again reuse best_rd
-  // despite setting it here.  The code should be refactored to avoid this.
-  (void) best_rd;
 
-  *rate = best_rate;
-  *dist = best_dist;
+  *rd_cost = best_rdc;
 
-  if (best_rate == INT_MAX)
+  if (best_rdc.rate == INT_MAX) {
+    vp9_rd_cost_reset(rd_cost);
     return;
+  }
 
   // update mode info array
-  subsize = get_subsize(bsize, pc_tree->partitioning);
-  fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
-                    pc_tree);
+  fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree);
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
-
-    // Check the projected output rate for this SB against it's target
-    // and and if necessary apply a Q delta using segmentation to get
-    // closer to the target.
-    if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
-                                    best_rate);
-    }
-
-    if (oxcf->aq_mode == CYCLIC_REFRESH_AQ)
-      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              best_rate, best_dist);
-
-    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+    encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                 bsize, pc_tree);
   }
 
-  if (bsize == BLOCK_64X64) {
+  if (bsize == BLOCK_64X64 && do_recon) {
     assert(tp_orig < *tp);
-    assert(best_rate < INT_MAX);
-    assert(best_dist < INT64_MAX);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
+static void nonrd_select_partition(VP9_COMP *cpi,
+                                   ThreadData *td,
+                                   TileDataEnc *tile_data,
+                                   MODE_INFO **mi,
+                                   TOKENEXTRA **tp,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize, int output_enabled,
+                                   RD_COST *rd_cost, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc;
+
+  vp9_rd_cost_reset(&this_rdc);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
+
+  if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) {
+    x->max_partition_size = BLOCK_32X32;
+    x->min_partition_size = BLOCK_16X16;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+             subsize >= BLOCK_16X16) {
+    x->max_partition_size = BLOCK_32X32;
+    x->min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+    x->max_partition_size = BLOCK_16X16;
+    x->min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        pc_tree->none.pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->none);
+        pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->none.mbmi_ext = *x->mbmi_ext;
+        pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->none.skip = x->skip;
+        break;
+      case PARTITION_VERT:
+        pc_tree->vertical[0].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->vertical[0]);
+        pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+        pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->vertical[0].skip = x->skip;
+        if (mi_col + hbs < cm->mi_cols) {
+          pc_tree->vertical[1].pred_pixel_ready = 1;
+          nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+                              &this_rdc, subsize, &pc_tree->vertical[1]);
+          pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+          pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+          pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->vertical[1].skip = x->skip;
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_HORZ:
+        pc_tree->horizontal[0].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->horizontal[0]);
+        pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+        pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->horizontal[0].skip = x->skip;
+        if (mi_row + hbs < cm->mi_rows) {
+          pc_tree->horizontal[1].pred_pixel_ready = 1;
+          nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+                              &this_rdc, subsize, &pc_tree->horizontal[1]);
+          pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+          pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+          pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->horizontal[1].skip = x->skip;
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_SPLIT:
+        subsize = get_subsize(bsize, PARTITION_SPLIT);
+        nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                               subsize, output_enabled, rd_cost,
+                               pc_tree->split[0]);
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp,
+                               mi_row, mi_col + hbs, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[1]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+                               mi_row + hbs, mi_col, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[2]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+                               mi_row + hbs, mi_col + hbs, subsize,
+                               output_enabled, &this_rdc, pc_tree->split[3]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        break;
+      default:
+        assert(0 && "Invalid partition type.");
+        break;
+    }
+  }
+
+  if (bsize == BLOCK_64X64 && output_enabled)
+    encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+}
+
+
 static void nonrd_use_partition(VP9_COMP *cpi,
-                                const TileInfo *const tile,
+                                ThreadData *td,
+                                TileDataEnc *tile_data,
                                 MODE_INFO **mi,
                                 TOKENEXTRA **tp,
                                 int mi_row, int mi_col,
                                 BLOCK_SIZE bsize, int output_enabled,
-                                int *totrate, int64_t *totdist,
-                                PC_TREE *pc_tree) {
+                                RD_COST *dummy_cost, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   const int mis = cm->mi_stride;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -2958,158 +3443,190 @@
   subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
   partition = partition_lookup[bsl][subsize];
 
+  if (output_enabled && bsize != BLOCK_4X4) {
+    int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    td->counts->partition[ctx][partition]++;
+  }
+
   switch (partition) {
     case PARTITION_NONE:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      pc_tree->none.pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->none);
       pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->none.mbmi_ext = *x->mbmi_ext;
       pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
       pc_tree->none.skip = x->skip;
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->none);
       break;
     case PARTITION_VERT:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      pc_tree->vertical[0].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->vertical[0]);
       pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
       pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[0].skip = x->skip;
-      if (mi_col + hbs < cm->mi_cols) {
-        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
-                            &rate, &dist, subsize, &pc_tree->vertical[1]);
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->vertical[0]);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        pc_tree->vertical[1].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+                            dummy_cost, subsize, &pc_tree->vertical[1]);
         pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
         pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
-        }
+        encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
+                    output_enabled, subsize, &pc_tree->vertical[1]);
       }
       break;
     case PARTITION_HORZ:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      pc_tree->horizontal[0].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->horizontal[0]);
       pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
       pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[0].skip = x->skip;
-      if (mi_row + hbs < cm->mi_rows) {
-        nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
-                            &rate, &dist, subsize, &pc_tree->horizontal[0]);
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->horizontal[0]);
+
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        pc_tree->horizontal[1].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+                            dummy_cost, subsize, &pc_tree->horizontal[1]);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
         pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
-        }
+        encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
+                    output_enabled, subsize, &pc_tree->horizontal[1]);
       }
       break;
     case PARTITION_SPLIT:
       subsize = get_subsize(bsize, PARTITION_SPLIT);
-      nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                          subsize, output_enabled, totrate, totdist,
-                          pc_tree->split[0]);
-      nonrd_use_partition(cpi, tile, mi + hbs, tp,
-                          mi_row, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[1]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
-      }
-      nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
-                          mi_row + hbs, mi_col, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[2]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
-      }
-      nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
-                          mi_row + hbs, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[3]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+      if (bsize == BLOCK_8X8) {
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+                            subsize, pc_tree->leaf_split[0]);
+        encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col,
+                    output_enabled, subsize, pc_tree->leaf_split[0]);
+      } else {
+        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                            subsize, output_enabled, dummy_cost,
+                            pc_tree->split[0]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
+                            mi_row, mi_col + hbs, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[1]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+                            mi_row + hbs, mi_col, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[2]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+                            mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[3]);
       }
       break;
     default:
-      assert("Invalid partition type.");
+      assert(0 && "Invalid partition type.");
       break;
   }
 
-  if (bsize == BLOCK_64X64 && output_enabled) {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              *totrate, *totdist);
-    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
-  }
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                                int mi_row, TOKENEXTRA **tp) {
+static void encode_nonrd_sb_row(VP9_COMP *cpi,
+                                ThreadData *td,
+                                TileDataEnc *tile_data,
+                                int mi_row,
+                                TOKENEXTRA **tp) {
   SPEED_FEATURES *const sf = &cpi->sf;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int mi_col;
 
   // Initialize the left context for the new SB row
-  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
-  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+  memset(&xd->left_context, 0, sizeof(xd->left_context));
+  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate = 0;
-    int64_t dummy_dist = 0;
+    const struct segmentation *const seg = &cm->seg;
+    RD_COST dummy_rdc;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
-    MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
-    BLOCK_SIZE bsize;
-
-    x->in_static_area = 0;
+    PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type;
+    BLOCK_SIZE bsize = BLOCK_64X64;
+    int seg_skip = 0;
     x->source_variance = UINT_MAX;
     vp9_zero(x->pred_mv);
+    vp9_rd_cost_init(&dummy_rdc);
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
+
+    if (seg->enabled) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      if (seg_skip) {
+        partition_search_type = FIXED_PARTITION;
+      }
+    }
 
     // Set the partition type of the 64X64 block
-    switch (sf->partition_search_type) {
+    switch (partition_search_type) {
       case VAR_BASED_PARTITION:
-        choose_partitioning(cpi, tile, mi_row, mi_col);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        // TODO(jingning, marpan): The mode decision and encoding process
+        // supports both intra and inter sub8x8 block coding for RTC mode.
+        // Tune the thresholds accordingly to use sub8x8 block coding for
+        // coding performance improvement.
+        choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         break;
       case SOURCE_VAR_BASED_PARTITION:
-        set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
+        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         break;
-      case VAR_BASED_FIXED_PARTITION:
       case FIXED_PARTITION:
-        bsize = sf->partition_search_type == FIXED_PARTITION ?
-                sf->always_this_block_size :
-                get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
-        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
-        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+        if (!seg_skip)
+          bsize = sf->always_this_block_size;
+        set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                            BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         break;
       case REFERENCE_PARTITION:
-        if (sf->partition_check ||
-            !is_background(cpi, tile, mi_row, mi_col)) {
-          set_modeinfo_offsets(cm, xd, mi_row, mi_col);
-          auto_partition_range(cpi, tile, mi_row, mi_col,
-                               &sf->min_partition_size,
-                               &sf->max_partition_size);
-          nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                               &dummy_rate, &dummy_dist, 1, INT64_MAX,
-                               cpi->pc_root);
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+            xd->mi[0]->mbmi.segment_id) {
+          // Use lower max_partition_size for low resolutions.
+          if (cm->width <= 352 && cm->height <= 288)
+            x->max_partition_size = BLOCK_32X32;
+          else
+            x->max_partition_size = BLOCK_64X64;
+          x->min_partition_size = BLOCK_8X8;
+          nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                               BLOCK_64X64, &dummy_rdc, 1,
+                               INT64_MAX, td->pc_root);
         } else {
-          copy_partitioning(cm, mi, prev_mi);
-          nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                              BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
-                              cpi->pc_root);
+          choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+          // TODO(marpan): Seems like nonrd_select_partition does not support
+          // the 4x4 partition. Since 4x4 is used on key frames, use this
+          // switch for now.
+          if (cm->frame_type == KEY_FRAME)
+            nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                                BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+          else
+            nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                                   BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         }
+
         break;
       default:
         assert(0);
@@ -3132,19 +3649,44 @@
   const int cutoff = (MIN(cm->width, cm->height) >= 720) ?
       (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
       (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
-  DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS);
+  DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
   diff *var16 = cpi->source_diff_var;
 
   int sum = 0;
   int i, j;
 
-  vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+  memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
 
   for (i = 0; i < cm->mb_rows; i++) {
     for (j = 0; j < cm->mb_cols; j++) {
-      vp9_get16x16var(src, src_stride, last_src, last_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
+                                   &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_10:
+            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+                                    &var16->sse, &var16->sum);
+            break;
+          case VPX_BITS_12:
+            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+                                      &var16->sse, &var16->sum);
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
+                   " or VPX_BITS_12");
+            return -1;
+        }
+      } else {
+        vpx_get16x16var(src, src_stride, last_src, last_stride,
+                        &var16->sse, &var16->sum);
+      }
+#else
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
-
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
           (((uint32_t)var16->sum * var16->sum) >> 8);
 
@@ -3192,9 +3734,9 @@
       if (cpi->source_diff_var)
         vpx_free(cpi->source_diff_var);
 
-        CHECK_MEM_ERROR(cm, cpi->source_diff_var,
-                        vpx_calloc(cm->MBs, sizeof(diff)));
-      }
+      CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                      vpx_calloc(cm->MBs, sizeof(diff)));
+    }
 
     if (!cpi->frames_till_next_var_check)
       cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
@@ -3206,13 +3748,13 @@
   }
 }
 
-static int get_skip_encode_frame(const VP9_COMMON *cm) {
+static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
   unsigned int intra_count = 0, inter_count = 0;
   int j;
 
   for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
-    intra_count += cm->counts.intra_inter[j][0];
-    inter_count += cm->counts.intra_inter[j][1];
+    intra_count += td->counts->intra_inter[j][0];
+    inter_count += td->counts->intra_inter[j][1];
   }
 
   return (intra_count << 2) < inter_count &&
@@ -3220,33 +3762,84 @@
          cm->show_frame;
 }
 
-static void encode_tiles(VP9_COMP *cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
+void vp9_init_tile_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_col, tile_row;
-  TOKENEXTRA *tok = cpi->tok;
+  TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+  int tile_tok = 0;
+
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+    if (cpi->tile_data != NULL)
+      vpx_free(cpi->tile_data);
+    CHECK_MEM_ERROR(cm, cpi->tile_data,
+        vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+    cpi->allocated_tiles = tile_cols * tile_rows;
+
+    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+        TileDataEnc *tile_data =
+            &cpi->tile_data[tile_row * tile_cols + tile_col];
+        int i, j;
+        for (i = 0; i < BLOCK_SIZES; ++i) {
+          for (j = 0; j < MAX_MODES; ++j) {
+            tile_data->thresh_freq_fact[i][j] = 32;
+            tile_data->mode_map[i][j] = j;
+          }
+        }
+      }
+  }
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo tile;
-      TOKENEXTRA *old_tok = tok;
-      int mi_row;
+      TileInfo *tile_info =
+          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+      vp9_tile_init(tile_info, cm, tile_row, tile_col);
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
-      for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
-           mi_row += MI_BLOCK_SIZE) {
-        if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
-          encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
-        else
-          encode_rd_sb_row(cpi, &tile, mi_row, &tok);
-      }
-      cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok);
-      assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+      cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+      pre_tok = cpi->tile_tok[tile_row][tile_col];
+      tile_tok = allocated_tokens(*tile_info);
     }
   }
 }
 
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
+                     int tile_row, int tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *this_tile =
+      &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo * const tile_info = &this_tile->tile_info;
+  TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+  int mi_row;
+
+  for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+       mi_row += MI_BLOCK_SIZE) {
+    if (cpi->sf.use_nonrd_pick_mode)
+      encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+    else
+      encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+  }
+  cpi->tok_count[tile_row][tile_col] =
+      (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
+  assert(tok - cpi->tile_tok[tile_row][tile_col] <=
+      allocated_tokens(*tile_info));
+}
+
+static void encode_tiles(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  int tile_col, tile_row;
+
+  vp9_init_tile_data(cpi);
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+      vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
+
 #if CONFIG_FP_MB_STATS
 static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
                             VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
@@ -3264,44 +3857,56 @@
 
 static void encode_frame_internal(VP9_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
-  RD_OPT *const rd_opt = &cpi->rd;
-  MACROBLOCK *const x = &cpi->mb;
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const x = &td->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
-  vp9_zero(cm->counts);
-  vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->tx_stepdown_count);
-  vp9_zero(rd_opt->comp_pred_diff);
-  vp9_zero(rd_opt->filter_diff);
-  vp9_zero(rd_opt->tx_select_diff);
-  vp9_zero(rd_opt->tx_select_threshes);
+  vp9_zero(*td->counts);
+  vp9_zero(rdc->coef_counts);
+  vp9_zero(rdc->comp_pred_diff);
+  vp9_zero(rdc->filter_diff);
 
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
 
-  cm->tx_mode = select_tx_mode(cpi);
-
-  x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+  x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
+                                      vp9_highbd_idct4x4_add;
+#else
+  x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 
-  if (xd->lossless) {
+  if (xd->lossless)
     x->optimize = 0;
-    cm->lf.filter_level = 0;
-    cpi->zbin_mode_boost_enabled = 0;
-  }
+
+  cm->tx_mode = select_tx_mode(cpi, xd);
 
   vp9_frame_init_quantizer(cpi);
 
   vp9_initialize_rd_consts(cpi);
-  vp9_initialize_me_consts(cpi, cm->base_qindex);
+  vp9_initialize_me_consts(cpi, x, cm->base_qindex);
   init_encode_frame_mb_context(cpi);
-  set_prev_mi(cm);
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
+  // Special case: set prev_mi to NULL when the previous mode info
+  // context cannot be used.
+  cm->prev_mi = cm->use_prev_frame_mvs ?
+                cm->prev_mip + cm->mi_stride + 1 : NULL;
 
   x->quant_fp = cpi->sf.use_quant_fp;
   vp9_zero(x->skip_txfm);
@@ -3311,7 +3916,7 @@
     int i;
     struct macroblock_plane *const p = x->plane;
     struct macroblockd_plane *const pd = xd->plane;
-    PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+    PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
 
     for (i = 0; i < MAX_MB_PLANE; ++i) {
       p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -3321,6 +3926,11 @@
     }
     vp9_zero(x->zcoeff_blk);
 
+    if (cm->frame_type != KEY_FRAME &&
+        cpi->rc.frames_since_golden == 0 &&
+        !cpi->use_svc)
+      cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
     if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
       source_var_based_partition_search_method(cpi);
   }
@@ -3336,13 +3946,18 @@
   }
 #endif
 
-    encode_tiles(cpi);
+    // If allowed, encode tiles in parallel, with one thread handling one tile.
+    if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+      vp9_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
 
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
   }
 
-  sf->skip_encode_frame = sf->skip_encode_sb ? get_skip_encode_frame(cm) : 0;
+  sf->skip_encode_frame = sf->skip_encode_sb ?
+      get_skip_encode_frame(cm, td) : 0;
 
 #if 0
   // Keep record of the total distortion this time around for future use
@@ -3369,7 +3984,6 @@
 
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  RD_OPT *const rd_opt = &cpi->rd;
 
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
@@ -3382,9 +3996,9 @@
              cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
         (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
              cm->ref_frame_sign_bias[LAST_FRAME])) {
-      cm->allow_comp_inter_inter = 0;
+      cpi->allow_comp_inter_inter = 0;
     } else {
-      cm->allow_comp_inter_inter = 1;
+      cpi->allow_comp_inter_inter = 1;
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
       cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3393,6 +4007,9 @@
 
   if (cpi->sf.frame_parameter_update) {
     int i;
+    RD_OPT *const rd_opt = &cpi->rd;
+    FRAME_COUNTS *counts = cpi->td.counts;
+    RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
     // This code does a single RD pass over the whole frame assuming
     // either compound, single or hybrid prediction as per whatever has
@@ -3404,11 +4021,10 @@
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
     int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
-    int *const tx_thrs = rd_opt->tx_select_threshes[frame_type];
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (is_alt_ref || !cm->allow_comp_inter_inter)
+    if (is_alt_ref || !cpi->allow_comp_inter_inter)
       cm->reference_mode = SINGLE_REFERENCE;
     else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
              mode_thrs[COMPOUND_REFERENCE] >
@@ -3427,33 +4043,26 @@
     encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i)
-      mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2;
+      mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
 
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2;
-
-    for (i = 0; i < TX_MODES; ++i) {
-      int64_t pd = rd_opt->tx_select_diff[i];
-      if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
-      tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2;
-    }
+      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
 
       for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
-        single_count_zero += cm->counts.comp_inter[i][0];
-        comp_count_zero += cm->counts.comp_inter[i][1];
+        single_count_zero += counts->comp_inter[i][0];
+        comp_count_zero += counts->comp_inter[i][1];
       }
 
       if (comp_count_zero == 0) {
         cm->reference_mode = SINGLE_REFERENCE;
-        vp9_zero(cm->counts.comp_inter);
+        vp9_zero(counts->comp_inter);
       } else if (single_count_zero == 0) {
         cm->reference_mode = COMPOUND_REFERENCE;
-        vp9_zero(cm->counts.comp_inter);
+        vp9_zero(counts->comp_inter);
       }
     }
 
@@ -3464,19 +4073,18 @@
       int count32x32 = 0;
 
       for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
-        count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
-        count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
+        count4x4 += counts->tx.p32x32[i][TX_4X4];
+        count4x4 += counts->tx.p16x16[i][TX_4X4];
+        count4x4 += counts->tx.p8x8[i][TX_4X4];
 
-        count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
-        count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
-        count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
+        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
 
-        count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
-        count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
-        count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
+        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+        count32x32 += counts->tx.p32x32[i][TX_32X32];
       }
-
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
           count32x32 == 0) {
         cm->tx_mode = ALLOW_8X8;
@@ -3517,35 +4125,18 @@
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
-static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
-  if (enabled) {
-    if (is_inter_block(mbmi)) {
-      if (mbmi->mode == ZEROMV) {
-        return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
-                                                : LF_ZEROMV_ZBIN_BOOST;
-      } else {
-        return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
-                                         : MV_ZBIN_BOOST;
-      }
-    } else {
-      return INTRA_ZBIN_BOOST;
-    }
-  } else {
-    return 0;
-  }
-}
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
+                              TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO **mi_8x8 = xd->mi;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
-  const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                             SEG_LVL_SKIP);
+  const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id,
+                                         SEG_LVL_SKIP);
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -3556,7 +4147,7 @@
                    cpi->sf.allow_skip_recode;
 
   if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode)
-    vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+    memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
 
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
@@ -3567,54 +4158,40 @@
   if (x->skip_encode)
     return;
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
-  // Experimental code. Special case for gf and arf zeromv modes.
-  // Increase zbin size to suppress noise
-  cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
-                                             cpi->zbin_mode_boost_enabled);
-  vp9_update_zbin_extra(cpi, x);
-
   if (!is_inter_block(mbmi)) {
     int plane;
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
       vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
-      sum_intra_stats(&cm->counts, mi);
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+      sum_intra_stats(td->counts, mi);
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
                                                      mbmi->ref_frame[ref]);
+      assert(cfg != NULL);
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    if (!cpi->sf.reuse_inter_pred_sby || seg_skip)
+    if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
     vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
-    if (!x->skip) {
-      mbmi->skip = 1;
-      vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-      vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-    } else {
-      mbmi->skip = 1;
-      if (output_enabled && !seg_skip)
-        cm->counts.skip[vp9_get_skip_context(xd)][1]++;
-      reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
-    }
+    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   }
 
   if (output_enabled) {
     if (cm->tx_mode == TX_MODE_SELECT &&
         mbmi->sb_type >= BLOCK_8X8  &&
         !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
-      ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
-                      &cm->counts.tx)[mbmi->tx_size];
+      ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
+                      &td->counts->tx)[mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE tx_size;
@@ -3631,5 +4208,7 @@
           if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
             mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
     }
+    ++td->counts->tx.tx_totals[mbmi->tx_size];
+    ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
   }
 }
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h
index fd1c9aa..6aaa564 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
 #ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
 #define VP9_ENCODER_VP9_ENCODEFRAME_H_
 
+#include "vpx/vpx_integer.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -19,6 +21,7 @@
 struct macroblock;
 struct yv12_buffer_config;
 struct VP9_COMP;
+struct ThreadData;
 
 // Constants used in SOURCE_VAR_BASED_PARTITION
 #define VAR_HIST_MAX_BG_VAR 1000
@@ -33,6 +36,12 @@
 
 void vp9_encode_frame(struct VP9_COMP *cpi);
 
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
+                     int tile_row, int tile_col);
+
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 8a737e1..00e4c61 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -11,16 +11,18 @@
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_scan.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -29,28 +31,6 @@
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
 };
 
-struct encode_b_args {
-  MACROBLOCK *x;
-  struct optimize_ctx *ctx;
-  int8_t *skip;
-};
-
-void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff, ptrdiff_t diff_stride,
-                          const uint8_t *src, ptrdiff_t src_stride,
-                          const uint8_t *pred, ptrdiff_t pred_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src  += src_stride;
-  }
-}
-
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -58,7 +38,15 @@
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
 
-  vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride,
+                              x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
 
@@ -68,8 +56,8 @@
   int           rate;
   int           error;
   int           next;
-  signed char   token;
-  short         qc;
+  int16_t       token;
+  int16_t       qc;
 } vp9_token_state;
 
 // TODO(jimbankoski): experiment to find optimal RD numbers.
@@ -107,9 +95,9 @@
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
   uint8_t token_cache[1024];
-  const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
@@ -122,8 +110,15 @@
   int next = eob, sz = 0;
   int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
+  int rate0, rate1, error0, error1;
+  int16_t t0, t1;
+  EXTRABIT e0;
   int best, band, pt, i, final_eob;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -142,7 +137,7 @@
 
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
+        vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
@@ -156,7 +151,7 @@
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->token;
+      vp9_get_token_extra(x, &t0, &e0);
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -169,8 +164,13 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
       dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -202,8 +202,10 @@
          */
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        e0 = 0;
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
+        vp9_get_token_extra(x, &t0, &e0);
+        t1 = t0;
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -222,10 +224,18 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
 
       if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         d2 = dx * dx;
       }
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
@@ -272,8 +282,8 @@
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = -1;
-  vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
-  vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
@@ -294,22 +304,33 @@
 }
 
 static INLINE void fdct32x32(int rd_transform,
-                             const int16_t *src, int16_t *dst, int src_stride) {
+                             const int16_t *src, tran_low_t *dst,
+                             int src_stride) {
   if (rd_transform)
-    vp9_fdct32x32_rd(src, dst, src_stride);
+    vpx_fdct32x32_rd(src, dst, src_stride);
   else
-    vp9_fdct32x32(src, dst, src_stride);
+    vpx_fdct32x32(src, dst, src_stride);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+                                    tran_low_t *dst, int src_stride) {
+  if (rd_transform)
+    vpx_highbd_fdct32x32_rd(src, dst, src_stride);
+  else
+    vpx_highbd_fdct32x32(src, dst, src_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -317,33 +338,72 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                     p->round_fp, p->quant_fp, p->quant_shift,
+                                     qcoeff, dqcoeff, pd->dequant,
+                                     eob, scan_order->scan,
+                                     scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                            pd->dequant, eob, scan_order->scan,
                             scan_order->iscan);
       break;
     case TX_16X16:
-      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, p->zbin_extra, eob,
+                      pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
-      vp9_fdct8x8(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, p->zbin_extra, eob,
-                      scan_order->scan, scan_order->iscan);
+      vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+                        x->skip_block, p->zbin, p->round_fp,
+                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                        pd->dequant, eob,
+                        scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                       p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, p->zbin_extra, eob,
+                      pd->dequant, eob,
                       scan_order->scan, scan_order->iscan);
       break;
     default:
@@ -357,9 +417,9 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -368,28 +428,62 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                     p->quant_fp[0], qcoeff, dqcoeff,
+                                     pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
-      vp9_fdct32x32_1(src_diff, coeff, diff_stride);
-      vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
+      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
                             p->quant_fp[0], qcoeff, dqcoeff,
                             pd->dequant[0], eob);
       break;
     case TX_16X16:
-      vp9_fdct16x16_1(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
       break;
     case TX_8X8:
-      vp9_fdct8x8_1(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
                       p->quant_fp[0], qcoeff, dqcoeff,
                       pd->dequant[0], eob);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
                       p->quant_fp[0], qcoeff, dqcoeff,
                       pd->dequant[0], eob);
       break;
@@ -405,9 +499,9 @@
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -415,33 +509,71 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           pd->dequant, eob, scan_order->scan,
                            scan_order->iscan);
       break;
     case TX_16X16:
-      vp9_fdct16x16(src_diff, coeff, diff_stride);
-      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
-      vp9_fdct8x8(src_diff, coeff, diff_stride);
-      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+      vpx_fdct8x8(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob,
+                     pd->dequant, eob,
                      scan_order->scan, scan_order->iscan);
       break;
     default:
@@ -458,7 +590,7 @@
   struct optimize_ctx *const ctx = args->ctx;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int i, j;
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
@@ -476,20 +608,34 @@
   }
 
   if (!x->skip_recode) {
-    if (x->skip_txfm[plane] == 0) {
-      // full forward transform and quantization
-      if (x->quant_fp)
+    if (x->quant_fp) {
+      // Encoding process for rtc mode
+      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
+        // skip forward transform
+        p->eobs[block] = 0;
+        *a = *l = 0;
+        return;
+      } else {
         vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
-      else
-        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-    } else if (x->skip_txfm[plane] == 2) {
-      // fast path forward transform and quantization
-      vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      }
     } else {
-      // skip forward transform
-      p->eobs[block] = 0;
-      *a = *l = 0;
-      return;
+      if (max_txsize_lookup[plane_bsize] == tx_size) {
+        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
+          // full forward transform and quantization
+          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
+          // fast path forward transform and quantization
+          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+        } else {
+          // skip forward transform
+          p->eobs[block] = 0;
+          *a = *l = 0;
+          return;
+        }
+      } else {
+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      }
     }
   }
 
@@ -505,6 +651,34 @@
 
   if (x->skip_encode || p->eobs[block] == 0)
     return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_16X16:
+        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_8X8:
+        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride,
+                               p->eobs[block], xd->bd);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride,
+                           p->eobs[block], xd->bd);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   switch (tx_size) {
     case TX_32X32:
@@ -534,7 +708,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int i, j;
   uint8_t *dst;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -542,8 +716,15 @@
 
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  if (p->eobs[block] > 0)
+  if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+       x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+       return;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  }
 }
 
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -559,6 +740,11 @@
   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
   int plane;
 
+  mbmi->skip = 1;
+
+  if (x->skip)
+    return;
+
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     if (!x->skip_recode)
       vp9_subtract_plane(x, bsize, plane);
@@ -575,21 +761,21 @@
   }
 }
 
-static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                               TX_SIZE tx_size, void *arg) {
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *scan_order;
-  TX_TYPE tx_type;
+  TX_TYPE tx_type = DCT_DCT;
   PREDICTION_MODE mode;
-  const int bwl = b_width_log2(plane_bsize);
+  const int bwl = b_width_log2_lookup[plane_bsize];
   const int diff_stride = 4 * (1 << bwl);
   uint8_t *src, *dst;
   int16_t *src_diff;
@@ -602,85 +788,163 @@
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+  if (tx_size == TX_4X4) {
+    tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+    scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+    mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
+  } else {
+    mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+    if (tx_size == TX_32X32) {
+      scan_order = &vp9_default_scan_orders[TX_32X32];
+    } else {
+      tx_type = get_tx_type(pd->plane_type, xd);
+      scan_order = &vp9_scan_orders[tx_size][tx_type];
+    }
+  }
+
+  vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
+                          x->skip_encode ? src_stride : dst_stride,
+                          dst, dst_stride, i, j, plane);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                      p->round, p->quant, p->quant_shift,
+                                      qcoeff, dqcoeff, pd->dequant, eob,
+                                      scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+        }
+        break;
+      case TX_16X16:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type == DCT_DCT)
+            vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
+                                  *eob, xd->bd);
+        }
+        break;
+      case TX_8X8:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type == DCT_DCT)
+            vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+                                xd->bd);
+        }
+        break;
+      case TX_4X4:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type != DCT_DCT)
+            vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+          else
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+
+        if (!x->skip_encode && *eob) {
+          if (tx_type == DCT_DCT) {
+            // this is like vp9_short_idct4x4 but has a special case around
+            // eob<=1 which is significant (not just an optimization) for the
+            // lossless case.
+            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          } else {
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+          }
+        }
+        break;
+      default:
+        assert(0);
+        return;
+    }
+    if (*eob)
+      *(args->skip) = 0;
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
-      scan_order = &vp9_default_scan_orders[TX_32X32];
-      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
-                              x->skip_encode ? src : dst,
-                              x->skip_encode ? src_stride : dst_stride,
-                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(32, 32, src_diff, diff_stride,
+        vpx_subtract_block(32, 32, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                             pd->dequant, eob, scan_order->scan,
                              scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_16X16:
-      tx_type = get_tx_type(pd->plane_type, xd);
-      scan_order = &vp9_scan_orders[TX_16X16][tx_type];
-      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
-                              x->skip_encode ? src : dst,
-                              x->skip_encode ? src_stride : dst_stride,
-                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(16, 16, src_diff, diff_stride,
+        vpx_subtract_block(16, 16, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
-        vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_8X8:
-      tx_type = get_tx_type(pd->plane_type, xd);
-      scan_order = &vp9_scan_orders[TX_8X8][tx_type];
-      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
-                              x->skip_encode ? src : dst,
-                              x->skip_encode ? src_stride : dst_stride,
-                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(8, 8, src_diff, diff_stride,
+        vpx_subtract_block(8, 8, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
-        vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_4X4:
-      tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
-      scan_order = &vp9_scan_orders[TX_4X4][tx_type];
-      mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
-                              x->skip_encode ? src : dst,
-                              x->skip_encode ? src_stride : dst_stride,
-                              dst, dst_stride, i, j, plane);
-
       if (!x->skip_recode) {
-        vp9_subtract_block(4, 4, src_diff, diff_stride,
+        vpx_subtract_block(4, 4, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
           x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
 
@@ -702,18 +966,10 @@
     *(args->skip) = 0;
 }
 
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            int8_t *skip) {
-  struct encode_b_args arg = {x, NULL, skip};
-  encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
-}
-
-
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
 
-  vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
-                                         &arg);
+  vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                         vp9_encode_block_intra, &arg);
 }
diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h
index 1999718..97df8a6 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/libvpx/vp9/encoder/vp9_encodemb.h
@@ -13,13 +13,16 @@
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct encode_b_args {
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  int8_t *skip;
+};
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
@@ -31,9 +34,8 @@
 
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
-                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            int8_t *skip);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index 9ad6db0..7848c93 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -12,7 +12,6 @@
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -22,14 +21,14 @@
 static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
 static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
 
-void vp9_entropy_mv_init() {
+void vp9_entropy_mv_init(void) {
   vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
   vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
   vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
   vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
 }
 
-static void encode_mv_component(vp9_writer* w, int comp,
+static void encode_mv_component(vpx_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
   const int sign = comp < 0;
@@ -42,7 +41,7 @@
   assert(comp != 0);
 
   // Sign
-  vp9_write(w, sign, mvcomp->sign);
+  vpx_write(w, sign, mvcomp->sign);
 
   // Class
   vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
@@ -56,7 +55,7 @@
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < n; ++i)
-      vp9_write(w, (d >> i) & 1, mvcomp->bits[i]);
+      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
   // Fractional bits
@@ -66,7 +65,7 @@
 
   // High precision bit
   if (usehp)
-    vp9_write(w, hp,
+    vpx_write(w, hp,
               mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
@@ -133,23 +132,23 @@
   }
 }
 
-static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
-                     vp9_prob upd_p) {
-  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                     vpx_prob upd_p) {
+  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
                      cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
-  vp9_write(w, update, upd_p);
+  vpx_write(w, update, upd_p);
   if (update) {
     *cur_p = new_p;
-    vp9_write_literal(w, new_p >> 1, 7);
+    vpx_write_literal(w, new_p >> 1, 7);
   }
   return update;
 }
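update_mv() above weighs refreshing a motion-vector probability against keeping the old one. cost_branch256(), vp9_cost_zero() and vp9_cost_one() all return costs in 1/256-bit units, so the extra 7 * 256 term is the 7 bits spent writing the new probability literal (new_p is forced odd, so only its upper 7 bits need to be coded). In those units the update is signalled only when

  cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
      cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256,

i.e. when the saving in branch coding outweighs the update flag plus the literal.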
 
-static void write_mv_update(const vp9_tree_index *tree,
-                            vp9_prob probs[/*n - 1*/],
+static void write_mv_update(const vpx_tree_index *tree,
+                            vpx_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/],
-                            int n, vp9_writer *w) {
+                            int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -161,10 +160,10 @@
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts) {
   int i, j;
-  nmv_context *const mvc = &cm->fc.nmvc;
-  nmv_context_counts *const counts = &cm->counts.mv;
+  nmv_context *const mvc = &cm->fc->nmvc;
 
   write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
 
@@ -199,7 +198,7 @@
   }
 }
 
-void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
                    const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,
@@ -216,7 +215,7 @@
 
   // If auto_mv_step_size is enabled then keep track of the largest
   // motion vector component used.
-  if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
+  if (cpi->sf.mv.auto_mv_step_size) {
     unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
     cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
   }
@@ -229,21 +228,24 @@
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+                    const int_mv mvs[2],
                     nmv_context_counts *counts) {
   int i;
 
   for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-    const MV *ref = &mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
     const MV diff = {mvs[i].as_mv.row - ref->row,
                      mvs[i].as_mv.col - ref->col};
     vp9_inc_mv(&diff, counts);
   }
 }
 
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
+void vp9_update_mv_count(ThreadData *td) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
   const MODE_INFO *mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
 
   if (mbmi->sb_type < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -254,12 +256,12 @@
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
         if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv);
+          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
       }
     }
   } else {
     if (mbmi->mode == NEWMV)
-      inc_mvs(mbmi, mbmi->mv, &cm->counts.mv);
+      inc_mvs(mbmi, mbmi_ext, mbmi->mv, &td->counts->mv);
   }
 }
 
diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h
index e67f9e3..5fb114c 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/libvpx/vp9/encoder/vp9_encodemv.h
@@ -18,17 +18,18 @@
 extern "C" {
 #endif
 
-void vp9_entropy_mv_init();
+void vp9_entropy_mv_init(void);
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts);
 
-void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context* mvctx, int usehp);
 
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd);
+void vp9_update_mv_count(ThreadData *td);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c
index 524744c..4654d63 100644
--- a/libvpx/vp9/encoder/vp9_encoder.c
+++ b/libvpx/vp9/encoder/vp9_encoder.c
@@ -12,9 +12,17 @@
 #include <stdio.h>
 #include <limits.h>
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -24,7 +32,7 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_reconinter.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/encoder/vp9_aq_complexity.h"
@@ -34,22 +42,22 @@
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/encoder/vp9_speed_features.h"
-#if CONFIG_INTERNAL_STATS
-#include "vp9/encoder/vp9_ssim.h"
-#endif
-#include "vp9/encoder/vp9_temporal_filter.h"
 #include "vp9/encoder/vp9_resize.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
 
-void vp9_coef_tree_initialize();
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
 
 #define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
 
@@ -59,12 +67,14 @@
                                          // mv. Choose a very high value for
                                          // now so that HIGH_PRECISION is always
                                          // chosen.
-
 // #define OUTPUT_YUV_REC
 
 #ifdef OUTPUT_YUV_DENOISED
 FILE *yuv_denoised_file = NULL;
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
 #endif
@@ -101,8 +111,111 @@
   }
 }
 
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static void suppress_active_map(VP9_COMP *cpi) {
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int i;
+  if (cpi->active_map.enabled || cpi->active_map.update)
+    for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static void apply_active_map(VP9_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const unsigned char *const active_map = cpi->active_map.map;
+  int i;
+
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  if (cpi->active_map.update) {
+    if (cpi->active_map.enabled) {
+      for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+      vp9_enable_segmentation(seg);
+      vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+      // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+      // filter level being zero regardless of the value of seg->abs_delta.
+      vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE,
+                      SEG_LVL_ALT_LF, -MAX_LOOP_FILTER);
+    } else {
+      vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+      if (seg->enabled) {
+        seg->update_data = 1;
+        seg->update_map = 1;
+      }
+    }
+    cpi->active_map.update = 0;
+  }
+}
+
+int vp9_set_active_map(VP9_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+    unsigned char *const active_map_8x8 = cpi->active_map.map;
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
+    cpi->active_map.update = 1;
+    if (new_map_16x16) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          active_map_8x8[r * mi_cols + c] =
+              new_map_16x16[(r >> 1) * cols + (c >> 1)]
+                  ? AM_SEGMENT_ID_ACTIVE
+                  : AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+      cpi->active_map.enabled = 1;
+    } else {
+      cpi->active_map.enabled = 0;
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int vp9_get_active_map(VP9_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+      new_map_16x16) {
+    unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+    if (cpi->active_map.enabled) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          // Cyclic refresh segments are considered active despite not having
+          // AM_SEGMENT_ID_ACTIVE
+          new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+              seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
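vp9_set_active_map() above expands the map supplied by the application, which has one entry per 16x16 macroblock, onto the encoder's 8x8 mode-info grid: each mi cell (r, c) reads macroblock entry (r >> 1, c >> 1), and vp9_get_active_map() collapses it back the same way. A minimal standalone sketch of just that index mapping (expand_mb_map_to_mi and its parameters are illustrative names, not libvpx API):

/* Illustrative only: expand an MB-granularity map (one byte per 16x16 block)
 * to mi granularity (one byte per 8x8 block), mirroring the r >> 1 / c >> 1
 * indexing used in vp9_set_active_map(). */
static void expand_mb_map_to_mi(const unsigned char *mb_map, int mb_cols,
                                unsigned char *mi_map, int mi_rows,
                                int mi_cols) {
  int r, c;
  for (r = 0; r < mi_rows; ++r)
    for (c = 0; c < mi_cols; ++c)
      mi_map[r * mi_cols + c] = mb_map[(r >> 1) * mb_cols + (c >> 1)];
}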
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
-  MACROBLOCK *const mb = &cpi->mb;
+  MACROBLOCK *const mb = &cpi->td.mb;
   cpi->common.allow_high_precision_mv = allow_high_precision_mv;
   if (cpi->common.allow_high_precision_mv) {
     mb->mvcost = mb->nmvcost_hp;
@@ -128,25 +241,92 @@
   }
 
   if (cm->frame_type == KEY_FRAME) {
-    if (!is_spatial_svc(cpi))
+    if (!is_two_pass_svc(cpi))
       cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
+    vp9_zero(cpi->interp_filter_selected);
   } else {
-    cm->fc = cm->frame_contexts[cm->frame_context_idx];
+    *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+    vp9_zero(cpi->interp_filter_selected[0]);
   }
 }
 
-void vp9_initialize_enc() {
-  static int init_done = 0;
+static void vp9_enc_setup_mi(VP9_COMMON *cm) {
+  int i;
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  // Clear top border row
+  memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+  // Clear left border column
+  for (i = 1; i < cm->mi_rows + 1; ++i)
+    memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (!cm->mip)
+    return 1;
+  cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip));
+  if (!cm->prev_mip)
+    return 1;
+  cm->mi_alloc_size = mi_size;
+
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->mi_grid_base)
+    return 1;
+  cm->prev_mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->prev_mi_grid_base)
+    return 1;
+
+  return 0;
+}
+
+static void vp9_enc_free_mi(VP9_COMMON *cm) {
+  vpx_free(cm->mip);
+  cm->mip = NULL;
+  vpx_free(cm->prev_mip);
+  cm->prev_mip = NULL;
+  vpx_free(cm->mi_grid_base);
+  cm->mi_grid_base = NULL;
+  vpx_free(cm->prev_mi_grid_base);
+  cm->prev_mi_grid_base = NULL;
+}
+
+static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+  // Current mip will be the prev_mip for the next frame.
+  MODE_INFO **temp_base = cm->prev_mi_grid_base;
+  MODE_INFO *temp = cm->prev_mip;
+  cm->prev_mip = cm->mip;
+  cm->mip = temp;
+
+  // Update the upper left visible macroblock ptrs.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+
+  cm->prev_mi_grid_base = cm->mi_grid_base;
+  cm->mi_grid_base = temp_base;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
+void vp9_initialize_enc(void) {
+  static volatile int init_done = 0;
 
   if (!init_done) {
-    vp9_init_neighbors();
-    vp9_coef_tree_initialize();
-    vp9_tokenize_initialize();
+    vp9_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+    vp9_init_intra_predictors();
     vp9_init_me_luts();
     vp9_rc_init_minq_luts();
     vp9_entropy_mv_init();
-    vp9_entropy_mode_init();
     vp9_temporal_filter_init();
     init_done = 1;
   }
@@ -156,33 +336,60 @@
   VP9_COMMON *const cm = &cpi->common;
   int i;
 
+  vpx_free(cpi->mbmi_ext_base);
+  cpi->mbmi_ext_base = NULL;
+
+  vpx_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
   // Delete segmentation map
   vpx_free(cpi->segmentation_map);
   cpi->segmentation_map = NULL;
-  vpx_free(cm->last_frame_seg_map);
-  cm->last_frame_seg_map = NULL;
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
   cpi->coding_context.last_frame_seg_map_copy = NULL;
 
-  vpx_free(cpi->complexity_map);
-  cpi->complexity_map = NULL;
+  vpx_free(cpi->nmvcosts[0]);
+  vpx_free(cpi->nmvcosts[1]);
+  cpi->nmvcosts[0] = NULL;
+  cpi->nmvcosts[1] = NULL;
+
+  vpx_free(cpi->nmvcosts_hp[0]);
+  vpx_free(cpi->nmvcosts_hp[1]);
+  cpi->nmvcosts_hp[0] = NULL;
+  cpi->nmvcosts_hp[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts[0]);
+  vpx_free(cpi->nmvsadcosts[1]);
+  cpi->nmvsadcosts[0] = NULL;
+  cpi->nmvsadcosts[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts_hp[0]);
+  vpx_free(cpi->nmvsadcosts_hp[1]);
+  cpi->nmvsadcosts_hp[0] = NULL;
+  cpi->nmvsadcosts_hp[1] = NULL;
 
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
-  vp9_free_ref_frame_buffers(cm);
+  vpx_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp9_free_postproc_buffers(cm);
+#endif
   vp9_free_context_buffers(cm);
 
-  vp9_free_frame_buffer(&cpi->last_frame_uf);
-  vp9_free_frame_buffer(&cpi->scaled_source);
-  vp9_free_frame_buffer(&cpi->scaled_last_source);
-  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
+  vpx_free_frame_buffer(&cpi->last_frame_uf);
+  vpx_free_frame_buffer(&cpi->scaled_source);
+  vpx_free_frame_buffer(&cpi->scaled_last_source);
+  vpx_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
-  vpx_free(cpi->tok);
-  cpi->tok = 0;
+  vpx_free(cpi->tile_tok[0][0]);
+  cpi->tile_tok[0][0] = 0;
 
-  vp9_free_pc_tree(cpi);
+  vp9_free_pc_tree(&cpi->td);
 
   for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
     LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
@@ -197,10 +404,13 @@
   }
 
   for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
-    vp9_free_frame_buffer(&cpi->svc.scaled_frames[i]);
+    vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]);
   }
-  vpx_memset(&cpi->svc.scaled_frames[0], 0,
-             MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+  memset(&cpi->svc.scaled_frames[0], 0,
+         MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+
+  vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
+  memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 }
 
 static void save_coding_context(VP9_COMP *cpi) {
@@ -211,19 +421,26 @@
   // restored with a call to vp9_restore_coding_context. These functions are
   // intended for use in a re-code loop in vp9_compress_frame where the
   // quantizer value is adjusted between loop iterations.
-  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+  vp9_copy(cc->nmvjointcost,  cpi->td.mb.nmvjointcost);
+
+  memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+         MV_VALS * sizeof(*cpi->nmvcosts[0]));
+  memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+         MV_VALS * sizeof(*cpi->nmvcosts[1]));
+  memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+         MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+  memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+         MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
   vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
 
-  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
+  memcpy(cpi->coding_context.last_frame_seg_map_copy,
+         cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
 
-  cc->fc = cm->fc;
+  cc->fc = *cm->fc;
 }
 
 static void restore_coding_context(VP9_COMP *cpi) {
@@ -232,20 +449,25 @@
 
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp9_save_coding_context.
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+  vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+
+  memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
+  memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
+  memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+         MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+  memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+         MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
   vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
 
-  vpx_memcpy(cm->last_frame_seg_map,
-             cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mi_rows * cm->mi_cols));
+  memcpy(cm->last_frame_seg_map,
+         cpi->coding_context.last_frame_seg_map_copy,
+         (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
 
-  cm->fc = cc->fc;
+  *cm->fc = cc->fc;
 }
 
 static void configure_static_seg_features(VP9_COMP *cpi) {
@@ -259,7 +481,7 @@
   // Disable and clear down for KF
   if (cm->frame_type == KEY_FRAME) {
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     seg->update_map = 0;
     seg->update_data = 0;
     cpi->static_mb_pct = 0;
@@ -272,7 +494,7 @@
   } else if (cpi->refresh_alt_ref_frame) {
     // If this is an alt ref frame
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     seg->update_map = 0;
     seg->update_data = 0;
     cpi->static_mb_pct = 0;
@@ -291,7 +513,8 @@
       seg->update_map = 1;
       seg->update_data = 1;
 
-      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875);
+      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->bit_depth);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
       vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
 
@@ -312,7 +535,8 @@
         seg->update_data = 1;
         seg->abs_delta = SEGMENT_DELTADATA;
 
-        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125);
+        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+                                      cm->bit_depth);
         vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
@@ -331,7 +555,7 @@
 
         vp9_disable_segmentation(seg);
 
-        vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+        memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
         seg->update_map = 0;
         seg->update_data = 0;
@@ -386,126 +610,148 @@
   }
 }
 
-
-static void set_speed_features(VP9_COMP *cpi) {
-#if CONFIG_INTERNAL_STATS
-  int i;
-  for (i = 0; i < MAX_MODES; ++i)
-    cpi->mode_chosen_counts[i] = 0;
-#endif
-
-  vp9_set_speed_features(cpi);
-
-  // Set rd thresholds based on mode and speed setting
-  vp9_set_rd_speed_thresholds(cpi);
-  vp9_set_rd_speed_thresholds_sub8x8(cpi);
-}
-
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
 
-  cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
-                                      cm->subsampling_x, cm->subsampling_y,
+  if (!cpi->lookahead)
+    cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+                                        cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                      cm->use_highbitdepth,
+#endif
                                       oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
-  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer,
                                oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
 
-static void alloc_ref_frame_buffers(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height))
-    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffers");
-}
-
 static void alloc_util_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+  if (vpx_realloc_frame_buffer(&cpi->last_frame_uf,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+  if (vpx_realloc_frame_buffer(&cpi->scaled_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
-  if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
+  if (vpx_realloc_frame_buffer(&cpi->scaled_last_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
 
+
+static int alloc_context_buffers_ext(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int mi_size = cm->mi_cols * cm->mi_rows;
+
+  cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
+  if (!cpi->mbmi_ext_base)
+    return 1;
+
+  return 0;
+}
+
 void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
   vp9_alloc_context_buffers(cm, cm->width, cm->height);
 
-  vpx_free(cpi->tok);
+  alloc_context_buffers_ext(cpi);
+
+  vpx_free(cpi->tile_tok[0][0]);
 
   {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-    CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+        vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
 
-  vp9_setup_pc_tree(&cpi->common, cpi);
-}
-
-static void update_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  vp9_set_mb_mi(cm, cm->width, cm->height);
-  vp9_init_context_buffers(cm);
-  init_macroblockd(cm, xd);
-
-  if (is_spatial_svc(cpi)) {
-    if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
-                                 cm->width, cm->height,
-                                 cm->subsampling_x, cm->subsampling_y,
-                                 VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to reallocate alt_ref_buffer");
-  }
+  vp9_setup_pc_tree(&cpi->common, &cpi->td);
 }
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
-  cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate;
+  cpi->framerate = framerate < 0.1 ? 30 : framerate;
   vp9_rc_update_framerate(cpi);
 }
 
-int64_t vp9_rescale(int64_t val, int64_t num, int denom) {
-  int64_t llnum = num;
-  int64_t llden = denom;
-  int64_t llval = val;
-
-  return (llval * llnum / llden);
-}
-
 static void set_tile_limits(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
   int min_log2_tile_cols, max_log2_tile_cols;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
-  cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
-                             min_log2_tile_cols, max_log2_tile_cols);
-  cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  if (is_two_pass_svc(cpi) &&
+      (cpi->svc.encode_empty_frame_state == ENCODING ||
+      cpi->svc.number_spatial_layers > 1)) {
+    cm->log2_tile_cols = 0;
+    cm->log2_tile_rows = 0;
+  } else {
+    cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+                               min_log2_tile_cols, max_log2_tile_cols);
+    cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  }
+}
+
+static void update_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+  vp9_init_context_buffers(cm);
+  vp9_init_macroblockd(cm, xd, NULL);
+  cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
+  memset(cpi->mbmi_ext_base, 0,
+         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+  set_tile_limits(cpi);
+
+  if (is_two_pass_svc(cpi)) {
+    if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                                 cm->width, cm->height,
+                                 cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to reallocate alt_ref_buffer");
+  }
 }
 
 static void init_buffer_indices(VP9_COMP *cpi) {
@@ -518,24 +764,33 @@
   VP9_COMMON *const cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->init_framerate;
 
   cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
-  cm->color_space = UNKNOWN;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+  cm->color_space = oxcf->color_space;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
   vp9_alloc_compressor_data(cpi);
 
+  cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
+  // Single thread case: use counts in common.
+  cpi->td.counts = &cm->counts;
+
   // Spatial scalability.
   cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
   // Temporal scalability.
   cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
 
-  if ((cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 &&
-      cpi->oxcf.mode == TWO_PASS_SECOND_BEST)) {
+  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass != 1)) {
     vp9_init_layer_context(cpi);
   }
 
@@ -546,8 +801,652 @@
   cpi->ref_frame_flags = 0;
 
   init_buffer_indices(cpi);
+}
 
-  set_tile_limits(cpi);
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const VP9EncoderConfig *oxcf) {
+  const int64_t bandwidth = oxcf->target_bandwidth;
+  const int64_t starting = oxcf->starting_buffer_level_ms;
+  const int64_t optimal = oxcf->optimal_buffer_level_ms;
+  const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+  rc->starting_buffer_level = starting * bandwidth / 1000;
+  rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8
+                                            : optimal * bandwidth / 1000;
+  rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8
+                                           : maximum * bandwidth / 1000;
+}
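set_rc_buffer_sizes() converts the clock-based buffer parameters into bits: each *_ms value is multiplied by the target bandwidth (bits per second) and divided by 1000, and a zero optimal or maximum level defaults to bandwidth / 8, i.e. an eighth of a second of data at the target rate. With illustrative numbers only, target_bandwidth = 500000 and starting_buffer_level_ms = 4000 give

  rc->starting_buffer_level = 4000 * 500000 / 1000 = 2000000 bits
  rc->optimal_buffer_level  = 500000 / 8           = 62500 bits   (when optimal == 0)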
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf = SDF; \
+    cpi->fn_ptr[BT].sdaf = SDAF; \
+    cpi->fn_ptr[BT].vf = VF; \
+    cpi->fn_ptr[BT].svf = SVF; \
+    cpi->fn_ptr[BT].svaf = SVAF; \
+    cpi->fn_ptr[BT].sdx3f = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+                                   int source_stride, \
+                                   const uint8_t *ref_ptr, \
+                                   int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) static unsigned int \
+fnname##_bits8(const uint8_t *src_ptr, \
+               int source_stride, \
+               const uint8_t *ref_ptr, \
+               int ref_stride, \
+               const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 4; \
+}
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 4; \
+}
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 4; \
+}
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t* const ref_ptr[], \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+  sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+  sad_array[i] >>= 4; \
+}
+
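The *_bits10 and *_bits12 wrappers generated by the macros above only rescale: per-pixel absolute differences reach 1023 for 10-bit samples and 4095 for 12-bit, roughly 4x and 16x the 8-bit range, so the SAD totals are shifted right by 2 and 4 before the encoder, whose thresholds are tuned for 8-bit data, consumes them. A minimal sketch of that normalization for a 10-bit block (illustrative only; highbd10_sad_bits8_scaled is not a libvpx function, and <stdint.h> is assumed for uint16_t):

/* Illustrative only: plain SAD over 10-bit samples stored in uint16_t,
 * scaled to the 8-bit range exactly as the _bits10 wrappers do with >> 2
 * (a 12-bit version would use >> 4 instead). */
static unsigned int highbd10_sad_bits8_scaled(const uint16_t *src,
                                              int src_stride,
                                              const uint16_t *ref,
                                              int ref_stride, int w, int h) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += d < 0 ? -d : d;
    }
  }
  return sad >> 2;  /* normalize 10-bit magnitudes to the 8-bit scale */
}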
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+
+static void  highbd_set_var_fns(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits8,
+                   vpx_highbd_sad32x16_avg_bits8,
+                   vpx_highbd_8_variance32x16,
+                   vpx_highbd_8_sub_pixel_variance32x16,
+                   vpx_highbd_8_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits8,
+                   vpx_highbd_sad16x32_avg_bits8,
+                   vpx_highbd_8_variance16x32,
+                   vpx_highbd_8_sub_pixel_variance16x32,
+                   vpx_highbd_8_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits8,
+                   vpx_highbd_sad64x32_avg_bits8,
+                   vpx_highbd_8_variance64x32,
+                   vpx_highbd_8_sub_pixel_variance64x32,
+                   vpx_highbd_8_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits8,
+                   vpx_highbd_sad32x64_avg_bits8,
+                   vpx_highbd_8_variance32x64,
+                   vpx_highbd_8_sub_pixel_variance32x64,
+                   vpx_highbd_8_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits8,
+                   vpx_highbd_sad32x32_avg_bits8,
+                   vpx_highbd_8_variance32x32,
+                   vpx_highbd_8_sub_pixel_variance32x32,
+                   vpx_highbd_8_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits8,
+                   vpx_highbd_sad32x32x8_bits8,
+                   vpx_highbd_sad32x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits8,
+                   vpx_highbd_sad64x64_avg_bits8,
+                   vpx_highbd_8_variance64x64,
+                   vpx_highbd_8_sub_pixel_variance64x64,
+                   vpx_highbd_8_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits8,
+                   vpx_highbd_sad64x64x8_bits8,
+                   vpx_highbd_sad64x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits8,
+                   vpx_highbd_sad16x16_avg_bits8,
+                   vpx_highbd_8_variance16x16,
+                   vpx_highbd_8_sub_pixel_variance16x16,
+                   vpx_highbd_8_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits8,
+                   vpx_highbd_sad16x16x8_bits8,
+                   vpx_highbd_sad16x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits8,
+                   vpx_highbd_sad16x8_avg_bits8,
+                   vpx_highbd_8_variance16x8,
+                   vpx_highbd_8_sub_pixel_variance16x8,
+                   vpx_highbd_8_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits8,
+                   vpx_highbd_sad16x8x8_bits8,
+                   vpx_highbd_sad16x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits8,
+                   vpx_highbd_sad8x16_avg_bits8,
+                   vpx_highbd_8_variance8x16,
+                   vpx_highbd_8_sub_pixel_variance8x16,
+                   vpx_highbd_8_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits8,
+                   vpx_highbd_sad8x16x8_bits8,
+                   vpx_highbd_sad8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits8,
+                   vpx_highbd_sad8x8_avg_bits8,
+                   vpx_highbd_8_variance8x8,
+                   vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits8,
+                   vpx_highbd_sad8x8x8_bits8,
+                   vpx_highbd_sad8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits8,
+                   vpx_highbd_sad8x4_avg_bits8,
+                   vpx_highbd_8_variance8x4,
+                   vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits8,
+                   vpx_highbd_sad8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits8,
+                   vpx_highbd_sad4x8_avg_bits8,
+                   vpx_highbd_8_variance4x8,
+                   vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits8,
+                   vpx_highbd_sad4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits8,
+                   vpx_highbd_sad4x4_avg_bits8,
+                   vpx_highbd_8_variance4x4,
+                   vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits8,
+                   vpx_highbd_sad4x4x8_bits8,
+                   vpx_highbd_sad4x4x4d_bits8)
+        break;
+
+      case VPX_BITS_10:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits10,
+                   vpx_highbd_sad32x16_avg_bits10,
+                   vpx_highbd_10_variance32x16,
+                   vpx_highbd_10_sub_pixel_variance32x16,
+                   vpx_highbd_10_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits10,
+                   vpx_highbd_sad16x32_avg_bits10,
+                   vpx_highbd_10_variance16x32,
+                   vpx_highbd_10_sub_pixel_variance16x32,
+                   vpx_highbd_10_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits10,
+                   vpx_highbd_sad64x32_avg_bits10,
+                   vpx_highbd_10_variance64x32,
+                   vpx_highbd_10_sub_pixel_variance64x32,
+                   vpx_highbd_10_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits10,
+                   vpx_highbd_sad32x64_avg_bits10,
+                   vpx_highbd_10_variance32x64,
+                   vpx_highbd_10_sub_pixel_variance32x64,
+                   vpx_highbd_10_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits10,
+                   vpx_highbd_sad32x32_avg_bits10,
+                   vpx_highbd_10_variance32x32,
+                   vpx_highbd_10_sub_pixel_variance32x32,
+                   vpx_highbd_10_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits10,
+                   vpx_highbd_sad32x32x8_bits10,
+                   vpx_highbd_sad32x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits10,
+                   vpx_highbd_sad64x64_avg_bits10,
+                   vpx_highbd_10_variance64x64,
+                   vpx_highbd_10_sub_pixel_variance64x64,
+                   vpx_highbd_10_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits10,
+                   vpx_highbd_sad64x64x8_bits10,
+                   vpx_highbd_sad64x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits10,
+                   vpx_highbd_sad16x16_avg_bits10,
+                   vpx_highbd_10_variance16x16,
+                   vpx_highbd_10_sub_pixel_variance16x16,
+                   vpx_highbd_10_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits10,
+                   vpx_highbd_sad16x16x8_bits10,
+                   vpx_highbd_sad16x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits10,
+                   vpx_highbd_sad16x8_avg_bits10,
+                   vpx_highbd_10_variance16x8,
+                   vpx_highbd_10_sub_pixel_variance16x8,
+                   vpx_highbd_10_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits10,
+                   vpx_highbd_sad16x8x8_bits10,
+                   vpx_highbd_sad16x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits10,
+                   vpx_highbd_sad8x16_avg_bits10,
+                   vpx_highbd_10_variance8x16,
+                   vpx_highbd_10_sub_pixel_variance8x16,
+                   vpx_highbd_10_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits10,
+                   vpx_highbd_sad8x16x8_bits10,
+                   vpx_highbd_sad8x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits10,
+                   vpx_highbd_sad8x8_avg_bits10,
+                   vpx_highbd_10_variance8x8,
+                   vpx_highbd_10_sub_pixel_variance8x8,
+                   vpx_highbd_10_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits10,
+                   vpx_highbd_sad8x8x8_bits10,
+                   vpx_highbd_sad8x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits10,
+                   vpx_highbd_sad8x4_avg_bits10,
+                   vpx_highbd_10_variance8x4,
+                   vpx_highbd_10_sub_pixel_variance8x4,
+                   vpx_highbd_10_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits10,
+                   vpx_highbd_sad8x4x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits10,
+                   vpx_highbd_sad4x8_avg_bits10,
+                   vpx_highbd_10_variance4x8,
+                   vpx_highbd_10_sub_pixel_variance4x8,
+                   vpx_highbd_10_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits10,
+                   vpx_highbd_sad4x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits10,
+                   vpx_highbd_sad4x4_avg_bits10,
+                   vpx_highbd_10_variance4x4,
+                   vpx_highbd_10_sub_pixel_variance4x4,
+                   vpx_highbd_10_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits10,
+                   vpx_highbd_sad4x4x8_bits10,
+                   vpx_highbd_sad4x4x4d_bits10)
+        break;
+
+      case VPX_BITS_12:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits12,
+                   vpx_highbd_sad32x16_avg_bits12,
+                   vpx_highbd_12_variance32x16,
+                   vpx_highbd_12_sub_pixel_variance32x16,
+                   vpx_highbd_12_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits12,
+                   vpx_highbd_sad16x32_avg_bits12,
+                   vpx_highbd_12_variance16x32,
+                   vpx_highbd_12_sub_pixel_variance16x32,
+                   vpx_highbd_12_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits12,
+                   vpx_highbd_sad64x32_avg_bits12,
+                   vpx_highbd_12_variance64x32,
+                   vpx_highbd_12_sub_pixel_variance64x32,
+                   vpx_highbd_12_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits12,
+                   vpx_highbd_sad32x64_avg_bits12,
+                   vpx_highbd_12_variance32x64,
+                   vpx_highbd_12_sub_pixel_variance32x64,
+                   vpx_highbd_12_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits12,
+                   vpx_highbd_sad32x32_avg_bits12,
+                   vpx_highbd_12_variance32x32,
+                   vpx_highbd_12_sub_pixel_variance32x32,
+                   vpx_highbd_12_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits12,
+                   vpx_highbd_sad32x32x8_bits12,
+                   vpx_highbd_sad32x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits12,
+                   vpx_highbd_sad64x64_avg_bits12,
+                   vpx_highbd_12_variance64x64,
+                   vpx_highbd_12_sub_pixel_variance64x64,
+                   vpx_highbd_12_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits12,
+                   vpx_highbd_sad64x64x8_bits12,
+                   vpx_highbd_sad64x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits12,
+                   vpx_highbd_sad16x16_avg_bits12,
+                   vpx_highbd_12_variance16x16,
+                   vpx_highbd_12_sub_pixel_variance16x16,
+                   vpx_highbd_12_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits12,
+                   vpx_highbd_sad16x16x8_bits12,
+                   vpx_highbd_sad16x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits12,
+                   vpx_highbd_sad16x8_avg_bits12,
+                   vpx_highbd_12_variance16x8,
+                   vpx_highbd_12_sub_pixel_variance16x8,
+                   vpx_highbd_12_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits12,
+                   vpx_highbd_sad16x8x8_bits12,
+                   vpx_highbd_sad16x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits12,
+                   vpx_highbd_sad8x16_avg_bits12,
+                   vpx_highbd_12_variance8x16,
+                   vpx_highbd_12_sub_pixel_variance8x16,
+                   vpx_highbd_12_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits12,
+                   vpx_highbd_sad8x16x8_bits12,
+                   vpx_highbd_sad8x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits12,
+                   vpx_highbd_sad8x8_avg_bits12,
+                   vpx_highbd_12_variance8x8,
+                   vpx_highbd_12_sub_pixel_variance8x8,
+                   vpx_highbd_12_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits12,
+                   vpx_highbd_sad8x8x8_bits12,
+                   vpx_highbd_sad8x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits12,
+                   vpx_highbd_sad8x4_avg_bits12,
+                   vpx_highbd_12_variance8x4,
+                   vpx_highbd_12_sub_pixel_variance8x4,
+                   vpx_highbd_12_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits12,
+                   vpx_highbd_sad8x4x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits12,
+                   vpx_highbd_sad4x8_avg_bits12,
+                   vpx_highbd_12_variance4x8,
+                   vpx_highbd_12_sub_pixel_variance4x8,
+                   vpx_highbd_12_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits12,
+                   vpx_highbd_sad4x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits12,
+                   vpx_highbd_sad4x4_avg_bits12,
+                   vpx_highbd_12_variance4x4,
+                   vpx_highbd_12_sub_pixel_variance4x4,
+                   vpx_highbd_12_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits12,
+                   vpx_highbd_sad4x4x8_bits12,
+                   vpx_highbd_sad4x4x4d_bits12)
+        break;
+
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void realloc_segmentation_maps(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  // Create the encoder segmentation map and set all entries to 0
+  vpx_free(cpi->segmentation_map);
+  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // Create a map used for cyclic background refresh.
+  if (cpi->cyclic_refresh)
+    vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+  // Create a map used to mark inactive areas.
+  vpx_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // And a placeholder structure for the coding context,
+  // for use if we want to save and restore it.
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
@@ -557,15 +1456,19 @@
   if (cm->profile != oxcf->profile)
     cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
 
   if (cm->profile <= PROFILE_1)
-    assert(cm->bit_depth == BITS_8);
+    assert(cm->bit_depth == VPX_BITS_8);
   else
-    assert(cm->bit_depth > BITS_8);
+    assert(cm->bit_depth > VPX_BITS_8);
 
   cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+  rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
@@ -583,35 +1486,15 @@
   }
   cpi->encode_breakout = cpi->oxcf.encode_breakout;
 
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.rc_mode == VPX_VBR) {
-    cpi->oxcf.starting_buffer_level_ms = 60000;
-    cpi->oxcf.optimal_buffer_level_ms = 60000;
-    cpi->oxcf.maximum_buffer_size_ms = 240000;
-  }
+  set_rc_buffer_sizes(rc, &cpi->oxcf);
 
-  rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
-                                          cpi->oxcf.target_bandwidth, 1000);
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level_ms == 0)
-    rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
-                                           cpi->oxcf.target_bandwidth, 1000);
-
-  if (cpi->oxcf.maximum_buffer_size_ms == 0)
-    rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
-                                          cpi->oxcf.target_bandwidth, 1000);
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
   rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
-  vp9_new_framerate(cpi, cpi->oxcf.framerate);
+  vp9_new_framerate(cpi, cpi->framerate);
 
   // Set absolute upper and lower quality limits
   rc->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -621,19 +1504,24 @@
 
   cm->display_width = cpi->oxcf.width;
   cm->display_height = cpi->oxcf.height;
+  cm->width = cpi->oxcf.width;
+  cm->height = cpi->oxcf.height;
 
   if (cpi->initial_width) {
-    // Increasing the size of the frame beyond the first seen frame, or some
-    // otherwise signaled maximum size, is not supported.
-    // TODO(jkoleszar): exit gracefully.
-    assert(cm->width <= cpi->initial_width);
-    assert(cm->height <= cpi->initial_height);
+    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+      vp9_free_context_buffers(cm);
+      vp9_alloc_compressor_data(cpi);
+      realloc_segmentation_maps(cpi);
+      cpi->initial_width = cpi->initial_height = 0;
+    }
   }
   update_frame_size(cpi);
 
   if ((cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass != 1)) {
     vp9_update_layer_context_change_config(cpi,
                                            (int)cpi->oxcf.target_bandwidth);
   }
@@ -652,12 +1540,8 @@
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
-                       cm->subsampling_x, cm->subsampling_y,
-                       VP9_ENC_BORDER_IN_PIXELS);
-  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
 #endif
 }
 
@@ -704,10 +1588,11 @@
 }
 
 
-VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
-  unsigned int i, j;
-  VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
+VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+                                BufferPool *const pool) {
+  unsigned int i;
+  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
   if (!cm)
     return NULL;
@@ -721,38 +1606,49 @@
   }
 
   cm->error.setjmp = 1;
+  cm->alloc_mi = vp9_enc_alloc_mi;
+  cm->free_mi = vp9_enc_free_mi;
+  cm->setup_mi = vp9_enc_setup_mi;
 
-  vp9_rtcd();
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
 
   cpi->use_svc = 0;
+  cpi->resize_state = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
+  cpi->common.buffer_pool = pool;
+
+  cpi->rc.high_source_sad = 0;
 
   init_config(cpi, oxcf);
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
   cm->current_video_frame = 0;
+  cpi->partition_search_skippable_frame = 0;
+  cpi->tile_data = NULL;
 
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last = 0;
-  cpi->gold_is_alt = 0;
+  realloc_segmentation_maps(cpi);
 
-  cpi->skippable_frame = 0;
-
-  // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
-  // Create a complexity map used for rd adjustment
-  CHECK_MEM_ERROR(cm, cpi->complexity_map,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
-  // Create a map used for cyclic background refresh.
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
-                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
-
-  // And a place holder structure is the coding context
-  // for use if we want to save and restore it
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
                    sizeof(cpi->mbgraph_stats[0])); i++) {
@@ -773,45 +1669,24 @@
 #endif
 
   cpi->refresh_alt_ref_frame = 0;
-
-  // Note that at the moment multi_arf will not work with svc.
-  // For the current check in all the execution paths are defaulted to 0
-  // pending further tuning and testing. The code is left in place here
-  // as a place holder in regard to the required paths.
   cpi->multi_arf_last_grp_enabled = 0;
-  if (oxcf->pass == 2) {
-    if (cpi->use_svc) {
-      cpi->multi_arf_allowed = 0;
-      cpi->multi_arf_enabled = 0;
-    } else {
-      // Disable by default for now.
-      cpi->multi_arf_allowed = 0;
-      cpi->multi_arf_enabled = 0;
-    }
-  } else {
-    cpi->multi_arf_allowed = 0;
-    cpi->multi_arf_enabled = 0;
-  }
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
+  cpi->b_calculate_blockiness = 1;
+  cpi->b_calculate_consistency = 1;
+  cpi->total_inconsistency = 0;
+  cpi->psnr.worst = 100.0;
+  cpi->worst_ssim = 100.0;
 
   cpi->count = 0;
   cpi->bytes = 0;
 
   if (cpi->b_calculate_psnr) {
-    cpi->total_y = 0.0;
-    cpi->total_u = 0.0;
-    cpi->total_v = 0.0;
-    cpi->total = 0.0;
     cpi->total_sq_error = 0;
     cpi->total_samples = 0;
 
-    cpi->totalp_y = 0.0;
-    cpi->totalp_u = 0.0;
-    cpi->totalp_v = 0.0;
-    cpi->totalp = 0.0;
     cpi->totalp_sq_error = 0;
     cpi->totalp_samples = 0;
 
@@ -823,34 +1698,48 @@
   }
 
   if (cpi->b_calculate_ssimg) {
-    cpi->total_ssimg_y = 0;
-    cpi->total_ssimg_u = 0;
-    cpi->total_ssimg_v = 0;
-    cpi->total_ssimg_all = 0;
+    cpi->ssimg.worst = 100.0;
+  }
+  cpi->fastssim.worst = 100.0;
+
+  cpi->psnrhvs.worst = 100.0;
+
+  if (cpi->b_calculate_blockiness) {
+    cpi->total_blockiness = 0;
+    cpi->worst_blockiness = 0.0;
+  }
+
+  if (cpi->b_calculate_consistency) {
+    cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) *
+                                4 * cpi->common.mi_rows * cpi->common.mi_cols);
+    cpi->worst_consistency = 100.0;
   }
 
 #endif
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
-  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
-  cal_nmvsadcosts(cpi->mb.nmvsadcost);
+  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
 
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
-  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
   yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
 #ifdef OUTPUT_YUV_REC
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
@@ -860,8 +1749,6 @@
   kf_list = fopen("kf_list.stt", "w");
 #endif
 
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
   if (oxcf->pass == 1) {
@@ -871,7 +1758,7 @@
     const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
     if (cpi->svc.number_spatial_layers > 1
-        && cpi->svc.number_temporal_layers == 1) {
+        || cpi->svc.number_temporal_layers > 1) {
       FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
       FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
       int i;
@@ -929,7 +1816,8 @@
     }
   }
 
-  set_speed_features(cpi);
+  vp9_set_speed_features_framesize_independent(cpi);
+  vp9_set_speed_features_framesize_dependent(cpi);
 
   // Allocate memory to store variances for a frame.
   CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -937,12 +1825,6 @@
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
 
-  // Default rd threshold factors for mode selection
-  for (i = 0; i < BLOCK_SIZES; ++i) {
-    for (j = 0; j < MAX_MODES; ++j)
-      cpi->rd.thresh_freq_fact[i][j] = 32;
-  }
-
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].sdaf           = SDAF; \
@@ -953,68 +1835,68 @@
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
-  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
-      vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
 
-  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
-      vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
 
-  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
-      vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
 
-  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
-      vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
 
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
-      vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
-      vp9_sad32x32x4d)
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+      vpx_sad32x32x4d)
 
-  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
-      vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
-      vp9_sad64x64x4d)
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+      vpx_sad64x64x4d)
 
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
-      vp9_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
-      vp9_sad16x16x4d)
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+      vpx_sad16x16x4d)
 
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
-      vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      vp9_sub_pixel_avg_variance16x8,
-      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8,
+      vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
 
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
-      vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      vp9_sub_pixel_avg_variance8x16,
-      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16,
+      vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
 
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
-      vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      vp9_sub_pixel_avg_variance8x8,
-      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
+      vpx_variance8x8, vpx_sub_pixel_variance8x8,
+      vpx_sub_pixel_avg_variance8x8,
+      vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
 
-  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
-      vp9_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
+      vpx_variance8x4, vpx_sub_pixel_variance8x4,
+      vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
 
-  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
-      vp9_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
+      vpx_variance4x8, vpx_sub_pixel_variance4x8,
+      vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
 
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
-      vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      vp9_sub_pixel_avg_variance4x4,
-      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
+      vpx_variance4x4, vpx_sub_pixel_variance4x4,
+      vpx_sub_pixel_avg_variance4x4,
+      vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
 
-  cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = vp9_diamond_search_sad;
-  cpi->refining_search_sad = vp9_refining_search_sad;
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
 
   /* vp9_init_quantizer() is first called here. Add check in
    * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
@@ -1029,55 +1911,88 @@
 
   return cpi;
 }
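+
+// Helpers for vp9_remove_compressor() below: append to a fixed-size char
+// array by writing at its current end and passing only the remaining
+// capacity to snprintf().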
+#define SNPRINT(H, T) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
 
 void vp9_remove_compressor(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
   unsigned int i;
+  int t;
 
   if (!cpi)
     return;
 
-  if (cpi && (cpi->common.current_video_frame > 0)) {
+  if (cpi && (cm->current_video_frame > 0)) {
 #if CONFIG_INTERNAL_STATS
+    vpx_clear_system_state();
 
-    vp9_clear_system_state();
-
-    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
     if (cpi->oxcf.pass != 1) {
+      char headings[512] = {0};
+      char results[512] = {0};
       FILE *f = fopen("opsnr.stt", "a");
       double time_encoded = (cpi->last_end_time_stamp_seen
                              - cpi->first_time_stamp_ever) / 10000000.000;
       double total_encode_time = (cpi->time_receive_data +
                                   cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000
-                  / time_encoded;
+      const double dr =
+          (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
-            vpx_sse_to_psnr((double)cpi->total_samples, 255.0,
+            vpx_sse_to_psnr((double)cpi->total_samples, peak,
                             (double)cpi->total_sq_error);
         const double totalp_psnr =
-            vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0,
+            vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
                             (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
-                                                cpi->summed_weights, 8.0);
+                                            cpi->summed_weights, 8.0);
         const double totalp_ssim = 100 * pow(cpi->summedp_quality /
-                                                cpi->summedp_weights, 8.0);
+                                             cpi->summedp_weights, 8.0);
 
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
-                "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr,
-                cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim,
-                total_encode_time);
-      }
+        snprintf(headings, sizeof(headings),
+                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                 "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+        snprintf(results, sizeof(results),
+                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+                 cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
+                 total_ssim, totalp_ssim,
+                 cpi->fastssim.stat[ALL] / cpi->count,
+                 cpi->psnrhvs.stat[ALL] / cpi->count,
+                 cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
+                 cpi->psnrhvs.worst);
 
-      if (cpi->b_calculate_ssimg) {
-        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                cpi->total_ssimg_y / cpi->count,
-                cpi->total_ssimg_u / cpi->count,
-                cpi->total_ssimg_v / cpi->count,
-                cpi->total_ssimg_all / cpi->count, total_encode_time);
+        if (cpi->b_calculate_blockiness) {
+          SNPRINT(headings, "\t  Block\tWstBlck");
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+        }
+
+        if (cpi->b_calculate_consistency) {
+          double consistency =
+              vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+                              (double)cpi->total_inconsistency);
+
+          SNPRINT(headings, "\tConsist\tWstCons");
+          SNPRINT2(results, "\t%7.3f", consistency);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+        }
+
+        if (cpi->b_calculate_ssimg) {
+          SNPRINT(headings, "\t  SSIMG\tWtSSIMG");
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
+        }
+
+        fprintf(f, "%s\t    Time\n", headings);
+        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
       }
 
       fclose(f);
@@ -1098,13 +2013,30 @@
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    vp9_denoiser_free(&(cpi->denoiser));
-  }
+  vp9_denoiser_free(&(cpi->denoiser));
 #endif
 
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  vpx_free(cpi->workers);
+
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
   dealloc_compressor_data(cpi);
-  vpx_free(cpi->tok);
 
   for (i = 0; i < sizeof(cpi->mbgraph_stats) /
                   sizeof(cpi->mbgraph_stats[0]); ++i) {
@@ -1118,7 +2050,11 @@
   }
 #endif
 
-  vp9_remove_common(&cpi->common);
+  vp9_remove_common(cm);
+  vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp9_free_postproc_buffers(cm);
+#endif
   vpx_free(cpi);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
@@ -1126,6 +2062,9 @@
   fclose(yuv_denoised_file);
 #endif
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
@@ -1143,6 +2082,66 @@
 
 #endif
 }
+
+/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
+ * and highbd_8_variance(). It should not.
+ */
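+// Plain C sum and sum-of-squared pixel differences between two blocks;
+// used for the odd-sized right/bottom strips in get_sse() below.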
+static void encoder_variance(const uint8_t *a, int  a_stride,
+                             const uint8_t *b, int  b_stride,
+                             int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
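+// High-bit-depth counterpart of encoder_variance(): operates on 16-bit
+// samples and accumulates sum/SSE in 64 bits; encoder_highbd_8_variance()
+// then narrows the totals back to the 8-bit-style interface.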
+static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h, uint64_t *sse,
+                                      uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h,
+                                      unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+                            &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static int64_t get_sse(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        int width, int height) {
@@ -1154,15 +2153,15 @@
   int x, y;
 
   if (dw > 0) {
-    variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-             dw, height, &sse, &sum);
+    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                     dw, height, &sse, &sum);
     total_sse += sse;
   }
 
   if (dh > 0) {
-    variance(&a[(height - dh) * a_stride], a_stride,
-             &b[(height - dh) * b_stride], b_stride,
-             width - dw, dh, &sse, &sum);
+    encoder_variance(&a[(height - dh) * a_stride], a_stride,
+                     &b[(height - dh) * b_stride], b_stride,
+                     width - dw, dh, &sse, &sum);
     total_sse += sse;
   }
 
@@ -1170,7 +2169,7 @@
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
       total_sse += sse;
 
       pa += 16;
@@ -1184,6 +2183,64 @@
   return total_sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
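+// SSE with both inputs shifted down by input_shift first; used when the
+// encode bit depth exceeds the source bit depth so the comparison is done
+// at the source depth.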
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t total_sse = 0;
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int64_t diff;
+      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+      total_sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total_sse;
+}
+
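+// Whole-plane SSE for high-bit-depth frames: the 16x16-aligned interior is
+// measured with vpx_highbd_8_mse16x16(), while any leftover right/bottom
+// strips fall back to the per-pixel helper above.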
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  int64_t total_sse = 0;
+  int x, y;
+  const int dw = width % 16;
+  const int dh = height % 16;
+  unsigned int sse = 0;
+  int sum = 0;
+  if (dw > 0) {
+    encoder_highbd_8_variance(&a[width - dw], a_stride,
+                              &b[width - dw], b_stride,
+                              dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+  if (dh > 0) {
+    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+                              &b[(height - dh) * b_stride], b_stride,
+                              width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+      pa += 16;
+      pb += 16;
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 typedef struct {
   double psnr[4];       // total/y/u/v
   uint64_t sse[4];      // total/y/u/v
@@ -1192,11 +2249,14 @@
 
 static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                       PSNR_STATS *psnr) {
-  const int widths[3]        = {a->y_width,  a->uv_width,  a->uv_width };
-  const int heights[3]       = {a->y_height, a->uv_height, a->uv_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
+  static const double peak = 255.0;
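+  // Each PSNR value below is derived from the SSE in the usual way:
+  // 10 * log10(peak^2 * samples / sse), per plane and then combined.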
+  const int widths[3]        = {
+      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int heights[3]       = {
+      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
   const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
   const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
   int i;
   uint64_t total_sse = 0;
@@ -1211,7 +2271,7 @@
                                  w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse);
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
 
     total_sse += sse;
     total_samples += samples;
@@ -1219,22 +2279,86 @@
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0,
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
                                   (double)total_sse);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b,
+                             PSNR_STATS *psnr,
+                             unsigned int bit_depth,
+                             unsigned int in_bit_depth) {
+  const int widths[3] =
+      {a->y_crop_width,  a->uv_crop_width,  a->uv_crop_width };
+  const int heights[3] =
+      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
+  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
+  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+                                   b_planes[i], b_strides[i], w, h,
+                                   input_shift);
+      } else {
+        sse = highbd_get_sse(a_planes[i], a_strides[i],
+                             b_planes[i], b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a_planes[i], a_strides[i],
+                    b_planes[i], b_strides[i],
+                    w, h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+                                  (double)total_sse);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void generate_psnr_packet(VP9_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
   calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+#endif
+
   for (i = 0; i < 4; ++i) {
     pkt.data.psnr.samples[i] = psnr.samples[i];
     pkt.data.psnr.sse[i] = psnr.sse[i];
     pkt.data.psnr.psnr[i] = psnr.psnr[i];
   }
   pkt.kind = VPX_CODEC_PSNR_PKT;
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
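+  // In SVC mode the per-frame PSNR stats are stored in the matching layer
+  // context rather than being added to the output packet list right away.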
+  if (cpi->use_svc)
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+        cpi->svc.number_temporal_layers].psnr_pkt = pkt.data.psnr;
+  else
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
 
 int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -1293,8 +2417,7 @@
   return 0;
 }
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-#if defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
 // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
 // as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
 // not denoise the UV channels at this time. If ever we implement UV channel
@@ -1309,23 +2432,22 @@
   } while (--h);
 
   src = s->u_buffer;
-  h = s->uv_height / 2;
+  h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width / 2, 1, f);
-    src += s->uv_stride + s->uv_width / 2;
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
   } while (--h);
 
   src = s->v_buffer;
-  h = s->uv_height / 2;
+  h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width / 2, 1, f);
-    src += s->uv_stride + s->uv_width / 2;
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
   } while (--h);
 }
 #endif
-#endif
 
 #ifdef OUTPUT_YUV_REC
 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
@@ -1333,6 +2455,36 @@
   uint8_t *src = s->y_buffer;
   int h = cm->height;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+    do {
+      fwrite(src16, s->y_width, 2, yuv_rec_file);
+      src16 += s->y_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2, yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2, yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    fflush(yuv_rec_file);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   do {
     fwrite(src, s->y_width, 1,  yuv_rec_file);
     src += s->y_stride;
@@ -1358,8 +2510,14 @@
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst,
+                                                int bd) {
+#else
 static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                                 YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
   const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
@@ -1375,15 +2533,31 @@
   const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height,
                               dst->uv_crop_height};
 
-  for (i = 0; i < MAX_MB_PLANE; ++i)
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+                              src_strides[i], dsts[i], dst_heights[i],
+                              dst_widths[i], dst_strides[i], bd);
+    } else {
+      vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                       dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+    }
+#else
     vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
                      dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
-
-  vp9_extend_frame_borders(dst);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  vpx_extend_frame_borders(dst);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int bd) {
+#else
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                    YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
@@ -1392,7 +2566,7 @@
   const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
   uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
-  const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP);
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
   int x, y, i;
 
   for (y = 0; y < dst_h; y += 16) {
@@ -1407,73 +2581,69 @@
                                      src_stride + (x / factor) * src_w / dst_w;
         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
-        vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                               16 / factor, 16 / factor, bd);
+        } else {
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                        16 / factor, 16 / factor);
+        }
+#else
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                       kernel[x_q4 & 0xf], 16 * src_w / dst_w,
                       kernel[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
 
-  vp9_extend_frame_borders(dst);
+  vpx_extend_frame_borders(dst);
 }
 
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  FILE *yframe;
-  int i;
-  char filename[255];
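+// Dynamic-resize trigger for key/golden/alt-ref frames: returns 1 only when
+// the group is still unscaled, q has reached the maximum allowed for this
+// frame's rate-control level, and the projected size exceeds the threshold.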
+static int scale_down(VP9_COMP *cpi, int q) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  int scale = 0;
+  assert(frame_is_kf_gf_arf(cpi));
 
-  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
+  if (rc->frame_size_selector == UNSCALED &&
+      q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+    const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
+        * MAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+    scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+  }
+  return scale;
 }
-#endif
 
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
-static int recode_loop_test(const VP9_COMP *cpi,
+static int recode_loop_test(VP9_COMP *cpi,
                             int high_limit, int low_limit,
                             int q, int maxq, int minq) {
-  const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
   int force_recode = 0;
 
-  // Special case trap if maximum allowed frame size exceeded.
-  if (rc->projected_frame_size > rc->max_frame_bandwidth) {
-    force_recode = 1;
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
+      (frame_is_kfgfarf &&
+       (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+    if (frame_is_kfgfarf &&
+        (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+        scale_down(cpi, q)) {
+      // Code this group at a lower resolution.
+      cpi->resize_pending = 1;
+      return 1;
+    }
 
-  // Is frame recode allowed.
-  // Yes if either recode mode 1 is selected or mode 2 is selected
-  // and the frame is a key frame, golden frame or alt_ref_frame
-  } else if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
-             ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) &&
-              (cm->frame_type == KEY_FRAME ||
-               cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
-    // General over and under shoot tests
+    // TODO(agrange) high_limit could be greater than the scale-down threshold.
     if ((rc->projected_frame_size > high_limit && q < maxq) ||
         (rc->projected_frame_size < low_limit && q > minq)) {
       force_recode = 1;
@@ -1491,13 +2661,14 @@
 
 void vp9_update_reference_frames(VP9_COMP *cpi) {
   VP9_COMMON * const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
-    ref_cnt_fb(cm->frame_bufs,
+    ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
-    ref_cnt_fb(cm->frame_bufs,
+    ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
   } else if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
@@ -1510,14 +2681,14 @@
     // slot and, if we're updating the GF, the current frame becomes the new GF.
     int tmp;
 
-    ref_cnt_fb(cm->frame_bufs,
+    ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
 
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
 
-    if (is_spatial_svc(cpi)) {
+    if (is_two_pass_svc(cpi)) {
       cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
       cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
     }
@@ -1529,19 +2700,34 @@
         arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
 
-      ref_cnt_fb(cm->frame_bufs,
+      ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+      memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
     }
 
     if (cpi->refresh_golden_frame) {
-      ref_cnt_fb(cm->frame_bufs,
+      ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+      if (!cpi->rc.is_src_frame_alt_ref)
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[0],
+               sizeof(cpi->interp_filter_selected[0]));
+      else
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[ALTREF_FRAME],
+               sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
     }
   }
 
   if (cpi->refresh_last_frame) {
-    ref_cnt_fb(cm->frame_bufs,
+    ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+    if (!cpi->rc.is_src_frame_alt_ref)
+      memcpy(cpi->interp_filter_selected[LAST_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1556,14 +2742,14 @@
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
   if (xd->lossless) {
       lf->filter_level = 0;
   } else {
     struct vpx_usec_timer timer;
 
-    vp9_clear_system_state();
+    vpx_clear_system_state();
 
     vpx_usec_timer_start(&timer);
 
@@ -1574,10 +2760,31 @@
   }
 
   if (lf->filter_level > 0) {
-    vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+    if (cpi->num_workers > 1)
+      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+                               lf->filter_level, 0, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
   }
 
-  vp9_extend_frame_inner_borders(cm->frame_to_show);
+  vpx_extend_frame_inner_borders(cm->frame_to_show);
+}
+
+static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+                                   int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+  if (new_fb_ptr->mvs == NULL ||
+      new_fb_ptr->mi_rows < cm->mi_rows ||
+      new_fb_ptr->mi_cols < cm->mi_cols) {
+    vpx_free(new_fb_ptr->mvs);
+    new_fb_ptr->mvs =
+      (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                           sizeof(*new_fb_ptr->mvs));
+    new_fb_ptr->mi_rows = cm->mi_rows;
+    new_fb_ptr->mi_cols = cm->mi_cols;
+  }
 }
 
 void vp9_scale_references(VP9_COMP *cpi) {
@@ -1586,22 +2793,78 @@
   const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
-
     // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
-    if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
-        (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
-      const int new_fb = get_free_fb(cm);
-      vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
-      scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
-      cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+    if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                                 ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+        continue;
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;
+        }
+        if (new_fb == INVALID_IDX)
+          return;
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   cm->use_highbitdepth,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#else
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;
+        }
+        if (new_fb == INVALID_IDX)
+          return;
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+        buf->buf.y_crop_width = ref->y_crop_width;
+        buf->buf.y_crop_height = ref->y_crop_height;
+        cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+        ++buf->ref_count;
+      }
     } else {
-      cpi->scaled_ref_idx[ref_frame - 1] = idx;
-      cm->frame_bufs[idx].ref_count++;
+      if (cpi->oxcf.pass != 0 || cpi->use_svc)
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
     }
   }
 }
@@ -1609,9 +2872,37 @@
 static void release_scaled_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int i;
-
-  for (i = 0; i < 3; i++)
-    cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
+  if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+    // Only release scaled references under certain conditions:
+    // if reference will be updated, or if scaled reference has same resolution.
+    int refresh[3];
+    refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
+    refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
+    refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      const int idx = cpi->scaled_ref_idx[i - 1];
+      RefCntBuffer *const buf = idx != INVALID_IDX ?
+          &cm->buffer_pool->frame_bufs[idx] : NULL;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+      if (buf != NULL &&
+          (refresh[i - 1] ||
+          (buf->buf.y_crop_width == ref->y_crop_width &&
+           buf->buf.y_crop_height == ref->y_crop_height))) {
+        --buf->ref_count;
+        cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+      }
+    }
+  } else {
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      const int idx = cpi->scaled_ref_idx[i];
+      RefCntBuffer *const buf = idx != INVALID_IDX ?
+          &cm->buffer_pool->frame_bufs[idx] : NULL;
+      if (buf != NULL) {
+        --buf->ref_count;
+        cpi->scaled_ref_idx[i] = INVALID_IDX;
+      }
+    }
+  }
 }
 
 static void full_to_model_count(unsigned int *model_count,
@@ -1640,32 +2931,41 @@
 static void output_frame_level_debug_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-  int recon_err;
+  int64_t recon_err;
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %10d %10d %10d %10d"
-        "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
-        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
+       "%10"PRId64" %10"PRId64" %10d "
+       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
         "%6d %6d %5d %5d %5d "
         "%10"PRId64" %10.3lf"
-        "%10lf %8u %10d %10d %10d\n",
-        cpi->common.current_video_frame, cpi->rc.this_frame_target,
+        "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+        cpi->common.current_video_frame,
+        cm->width, cm->height,
+        cpi->rc.source_alt_ref_pending,
+        cpi->rc.source_alt_ref_active,
+        cpi->rc.this_frame_target,
         cpi->rc.projected_frame_size,
         cpi->rc.projected_frame_size / cpi->common.MBs,
         (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
         cpi->rc.vbr_bits_off_target,
+        cpi->rc.vbr_bits_off_target_fast,
+        cpi->twopass.extend_minq,
+        cpi->twopass.extend_minq_fast,
         cpi->rc.total_target_vs_actual,
         (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
         cpi->rc.total_actual_bits, cm->base_qindex,
-        vp9_convert_qindex_to_q(cm->base_qindex),
-        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality),
+        vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+                                cm->bit_depth),
         cpi->rc.avg_q,
-        vp9_convert_qindex_to_q(cpi->oxcf.cq_level),
+        vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
         cpi->refresh_last_frame, cpi->refresh_golden_frame,
         cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
         cpi->twopass.bits_left,
@@ -1673,7 +2973,8 @@
         cpi->twopass.bits_left /
             (1 + cpi->twopass.total_left_stats.coded_error),
         cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
-        cpi->twopass.kf_zeromotion_pct);
+        cpi->twopass.kf_zeromotion_pct,
+        cpi->twopass.fr_content_type);
 
   fclose(f);
 
@@ -1695,12 +2996,229 @@
 }
 #endif
 
-static void encode_without_recode_loop(VP9_COMP *cpi,
-                                       int q) {
+static void set_mv_search_params(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+  if (cpi->sf.mv.auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame) {
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param =
+            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      }
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+}
+
+static void set_size_independent_vars(VP9_COMP *cpi) {
+  vp9_set_speed_features_framesize_independent(cpi);
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
+  cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
+                                    int *bottom_index, int *top_index) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_clear_system_state();
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Setup variables that depend on the dimensions of the frame.
+  vp9_set_speed_features_framesize_dependent(cpi);
+
+  // Decide q and q bounds.
+  *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+  if (!frame_is_intra_only(cm)) {
+    vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC
+  if (oxcf->noise_sensitivity > 0) {
+    int l = 0;
+    switch (oxcf->noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+    vp9_denoise(cpi->Source, cpi->Source, l);
+  }
+#endif  // CONFIG_VP9_POSTPROC
+}
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+  int y_stride = cpi->scaled_source.y_stride;
+
+  if (cpi->sf.mv.search_method == NSTEP) {
+    vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+  } else if (cpi->sf.mv.search_method == DIAMOND) {
+    vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+  }
+}
+
+static void set_frame_size(VP9_COMP *cpi) {
+  int ref_frame;
+  VP9_COMMON *const cm = &cpi->common;
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  if (oxcf->pass == 2 &&
+      oxcf->rc_mode == VPX_VBR &&
+      ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+        (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+    calculate_coded_size(
+        cpi, &oxcf->scaled_frame_width, &oxcf->scaled_frame_height);
+
+    // There has been a change in frame size.
+    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+  }
+
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      !cpi->use_svc &&
+      oxcf->resize_mode == RESIZE_DYNAMIC) {
+      if (cpi->resize_pending == 1) {
+        oxcf->scaled_frame_width =
+            (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+        oxcf->scaled_frame_height =
+            (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+      } else if (cpi->resize_pending == -1) {
+        // Go back up to original size.
+        oxcf->scaled_frame_width = oxcf->width;
+        oxcf->scaled_frame_height = oxcf->height;
+      }
+      if (cpi->resize_pending != 0) {
+        // There has been a change in frame size.
+        vp9_set_size_literal(cpi,
+                             oxcf->scaled_frame_width,
+                             oxcf->scaled_frame_height);
+
+        // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+        set_mv_search_params(cpi);
+      }
+  }
+
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    vp9_set_target_rate(cpi);
+  }
+
+  alloc_frame_mvs(cm, cm->new_fb_idx);
+
+  // Reset the frame pointers to the current frame size.
+  vpx_realloc_frame_buffer(get_frame_new_buffer(cm),
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
+                           VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                           NULL, NULL, NULL);
+
+  alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+    ref_buf->idx = buf_idx;
+
+    if (buf_idx != INVALID_IDX) {
+      YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+      ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height,
+                                        (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+                                            1 : 0);
+#else
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      if (vp9_is_scaled(&ref_buf->sf))
+        vpx_extend_frame_borders(buf);
+    } else {
+      ref_buf->buf = NULL;
+    }
+  }
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static void encode_without_recode_loop(VP9_COMP *cpi,
+                                       size_t *size,
+                                       uint8_t *dest) {
+  VP9_COMMON *const cm = &cpi->common;
+  int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+
+  vpx_clear_system_state();
+
+  set_frame_size(cpi);
+
+  cpi->Source = vp9_scale_if_required(cm,
+                                      cpi->un_scaled_source,
+                                      &cpi->scaled_source);
+  if (cpi->unscaled_last_source != NULL)
+    cpi->Last_Source = vp9_scale_if_required(cm,
+                                             cpi->unscaled_last_source,
+                                             &cpi->scaled_last_source);
+
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->resize_state == 0 &&
+      cm->frame_type != KEY_FRAME &&
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+    vp9_avg_source_sad(cpi);
+
+  if (frame_is_intra_only(cm) == 0) {
+    vp9_scale_references(cpi);
+  }
+
+  set_size_independent_vars(cpi);
+  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
   vp9_set_quantizer(cm, q);
+  vp9_set_variance_partition_thresholds(cpi, q);
+
   setup_frame(cpi);
+
+  suppress_active_map(cpi);
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -1710,38 +3228,118 @@
   } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
     vp9_cyclic_refresh_setup(cpi);
   }
+  apply_active_map(cpi);
+
   // transform / motion compensation build reconstruction frame
   vp9_encode_frame(cpi);
 
+  // Check if we should drop this frame because of high overshoot.
+  // Only for frames where a high temporal-source SAD is detected.
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->resize_state == 0 &&
+      cm->frame_type != KEY_FRAME &&
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->rc.high_source_sad == 1) {
+    int frame_size = 0;
+    // Get an estimate of the encoded frame size.
+    save_coding_context(cpi);
+    vp9_pack_bitstream(cpi, dest, size);
+    restore_coding_context(cpi);
+    frame_size = (int)(*size) << 3;
+    // Check if the encoded frame will overshoot too much; if so, set a new q,
+    // adjust some rate control parameters, and return to re-encode the frame.
+    if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) {
+      vpx_clear_system_state();
+      vp9_set_quantizer(cm, q);
+      vp9_set_variance_partition_thresholds(cpi, q);
+      suppress_active_map(cpi);
+      // Turn off cyclic refresh for the re-encoded frame.
+      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+        unsigned char *const seg_map = cpi->segmentation_map;
+        memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+        vp9_disable_segmentation(&cm->seg);
+      }
+      apply_active_map(cpi);
+      vp9_encode_frame(cpi);
+    }
+  }
+
+  // Update some stats from cyclic refresh, and check whether to skip updating
+  // the golden reference, for non-SVC 1-pass CBR.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cm->frame_type != KEY_FRAME &&
+      !cpi->use_svc &&
+      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
+    vp9_cyclic_refresh_check_golden_update(cpi);
+
   // Update the skip mb flag probabilities based on the distribution
   // seen in the last encoder iteration.
   // update_base_skip_probs(cpi);
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 }
 
 static void encode_with_recode_loop(VP9_COMP *cpi,
                                     size_t *size,
-                                    uint8_t *dest,
-                                    int q,
-                                    int bottom_index,
-                                    int top_index) {
+                                    uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  int bottom_index, top_index;
   int loop_count = 0;
+  int loop_at_this_size = 0;
   int loop = 0;
   int overshoot_seen = 0;
   int undershoot_seen = 0;
-  int q_low = bottom_index, q_high = top_index;
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
+  int q = 0, q_low = 0, q_high = 0;
 
-  // Decide frame size bounds
-  vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
-                                   &frame_under_shoot_limit,
-                                   &frame_over_shoot_limit);
+  set_size_independent_vars(cpi);
 
   do {
-    vp9_clear_system_state();
+    vpx_clear_system_state();
+
+    set_frame_size(cpi);
+
+    if (loop_count == 0 || cpi->resize_pending != 0) {
+      set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+      // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+      set_mv_search_params(cpi);
+
+      // Reset the loop state for new frame size.
+      overshoot_seen = 0;
+      undershoot_seen = 0;
+
+      // Reconfiguration for change in frame size has concluded.
+      cpi->resize_pending = 0;
+
+      q_low = bottom_index;
+      q_high = top_index;
+
+      loop_at_this_size = 0;
+    }
+
+    // Decide frame size bounds first time through.
+    if (loop_count == 0) {
+      vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                       &frame_under_shoot_limit,
+                                       &frame_over_shoot_limit);
+    }
+
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source);
+
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                               &cpi->scaled_last_source);
+
+    if (frame_is_intra_only(cm) == 0) {
+      if (loop_count > 0) {
+        release_scaled_references(cpi);
+      }
+      vp9_scale_references(cpi);
+    }
 
     vp9_set_quantizer(cm, q);
 
@@ -1763,14 +3361,13 @@
     // seen in the last encoder iteration.
     // update_base_skip_probs(cpi);
 
-    vp9_clear_system_state();
+    vpx_clear_system_state();
 
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       save_coding_context(cpi);
-      cpi->dummy_packing = 1;
       if (!cpi->sf.use_nonrd_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
@@ -1788,10 +3385,20 @@
            rc->this_key_frame_forced &&
            (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
-        int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        int64_t kf_err;
 
-        int high_err_target = cpi->ambient_err;
-        int low_err_target = cpi->ambient_err >> 1;
+        int64_t high_err_target = cpi->ambient_err;
+        int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        } else {
+          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        }
+#else
+        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Prevent possible divide by zero error below for perfect KF
         kf_err += !kf_err;
@@ -1806,7 +3413,7 @@
           q_high = q > q_low ? q - 1 : q_low;
 
           // Adjust Q
-          q = (q * high_err_target) / kf_err;
+          q = (int)((q * high_err_target) / kf_err);
           q = MIN(q, (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
                    rc->projected_frame_size >= frame_under_shoot_limit) {
@@ -1815,7 +3422,7 @@
           q_low = q < q_high ? q + 1 : q_high;
 
           // Adjust Q
-          q = (q * low_err_target) / kf_err;
+          q = (int)((q * low_err_target) / kf_err);
           q = MIN(q, (q_high + q_low + 1) >> 1);
         }
 
@@ -1831,6 +3438,20 @@
         int last_q = q;
         int retries = 0;
 
+        if (cpi->resize_pending == 1) {
+          // Change in frame size so go back around the recode loop.
+          cpi->rc.frame_size_selector =
+              SCALE_STEP1 - cpi->rc.frame_size_selector;
+          cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+          ++cpi->tot_recode_hits;
+#endif
+          ++loop_count;
+          loop = 1;
+          continue;
+        }
+
         // Frame size out of permitted range:
         // Update correction factor & compute new Q to try...
 
@@ -1843,20 +3464,20 @@
           // Raise Qlow as to at least the current value
           q_low = q < q_high ? q + 1 : q_high;
 
-          if (undershoot_seen || loop_count > 1) {
+          if (undershoot_seen || loop_at_this_size > 1) {
             // Update rate_correction_factor unless
-            vp9_rc_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi);
 
             q = (q_high + q_low + 1) / 2;
           } else {
             // Update rate_correction_factor unless
-            vp9_rc_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi);
 
             q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, MAX(q_high, top_index));
 
             while (q < q_low && retries < 10) {
-              vp9_rc_update_rate_correction_factors(cpi, 0);
+              vp9_rc_update_rate_correction_factors(cpi);
               q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, MAX(q_high, top_index));
               retries++;
@@ -1868,11 +3489,11 @@
           // Frame is too small
           q_high = q > q_low ? q - 1 : q_low;
 
-          if (overshoot_seen || loop_count > 1) {
-            vp9_rc_update_rate_correction_factors(cpi, 1);
+          if (overshoot_seen || loop_at_this_size > 1) {
+            vp9_rc_update_rate_correction_factors(cpi);
             q = (q_high + q_low) / 2;
           } else {
-            vp9_rc_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi);
             q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, top_index);
             // Special case reset for qlow for constrained quality.
@@ -1885,7 +3506,7 @@
             }
 
             while (q > q_high && retries < 10) {
-              vp9_rc_update_rate_correction_factors(cpi, 0);
+              vp9_rc_update_rate_correction_factors(cpi);
               q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, top_index);
               retries++;
@@ -1898,7 +3519,7 @@
         // Clamp Q to upper and lower limits:
         q = clamp(q, q_low, q_high);
 
-        loop = q != last_q;
+        loop = (q != last_q);
       } else {
         loop = 0;
       }
@@ -1910,45 +3531,38 @@
       loop = 0;
 
     if (loop) {
-      loop_count++;
+      ++loop_count;
+      ++loop_at_this_size;
 
 #if CONFIG_INTERNAL_STATS
-      cpi->tot_recode_hits++;
+      ++cpi->tot_recode_hits;
 #endif
     }
   } while (loop);
 }
 
-static void get_ref_frame_flags(VP9_COMP *cpi) {
-  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_last = 0;
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;
-
-  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 1;
-  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 0;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+  if (gold_is_last)
+    flags &= ~VP9_GOLD_FLAG;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
-      !is_spatial_svc(cpi))
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1))
+    flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  if (alt_is_last)
+    flags &= ~VP9_ALT_FLAG;
 
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  if (gold_is_alt)
+    flags &= ~VP9_ALT_FLAG;
+
+  return flags;
 }
 
 static void set_ext_overrides(VP9_COMP *cpi) {
@@ -1973,34 +3587,27 @@
                                           YV12_BUFFER_CONFIG *scaled) {
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
-    scale_and_extend_frame_nonnormative(unscaled, scaled);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (unscaled->y_width == (scaled->y_width << 1) &&
+        unscaled->y_height == (scaled->y_height << 1))
+      scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
+    else
+      scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+    // Use the faster normative (convolve8) scaling filter; for now, only for
+    // a scaling factor of 2.
+    if (unscaled->y_width == (scaled->y_width << 1) &&
+        unscaled->y_height == (scaled->y_height << 1))
+      scale_and_extend_frame(unscaled, scaled);
+    else
+      scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     return scaled;
   } else {
     return unscaled;
   }
 }
 
-static void configure_skippable_frame(VP9_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-
-  SVC *const svc = &cpi->svc;
-  TWO_PASS *const twopass = is_spatial_svc(cpi) ?
-                            &svc->layer_context[svc->spatial_layer_id].twopass
-                            : &cpi->twopass;
-
-  cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
-    twopass->stats_in - 2 > twopass->stats_in_start &&
-    twopass->stats_in < twopass->stats_in_end &&
-    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
-    == 1 &&
-    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
-    == 1 &&
-    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
 static void set_arf_sign_bias(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int arf_sign_bias;
@@ -2017,37 +3624,43 @@
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 }
 
+static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+  INTERP_FILTER ifilter;
+  int ref_total[MAX_REF_FRAMES] = {0};
+  MV_REFERENCE_FRAME ref;
+  int mask = 0;
+  if (cpi->common.last_frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame)
+    return mask;
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+    for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+      ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+  for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+    if ((ref_total[LAST_FRAME] &&
+        cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+        (ref_total[GOLDEN_FRAME] == 0 ||
+         cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
+           < ref_total[GOLDEN_FRAME]) &&
+        (ref_total[ALTREF_FRAME] == 0 ||
+         cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
+           < ref_total[ALTREF_FRAME]))
+      mask |= 1 << ifilter;
+  }
+  return mask;
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
                                       unsigned int *frame_flags) {
   VP9_COMMON *const cm = &cpi->common;
-  TX_SIZE t;
-  int q;
-  int top_index;
-  int bottom_index;
-
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
+  TX_SIZE t;
+
   set_ext_overrides(cpi);
-
-  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
-
-  if (cpi->unscaled_last_source != NULL)
-    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
-
-  vp9_scale_references(cpi);
-
-  vp9_clear_system_state();
-
-  // Enable or disable mode based tweaking of the zbin.
-  // For 2 pass only used where GF/ARF prediction quality
-  // is above a threshold.
-  cpi->zbin_mode_boost = 0;
-  cpi->zbin_mode_boost_enabled = 0;
+  vpx_clear_system_state();
 
   // Set the arf sign bias for this frame.
   set_arf_sign_bias(cpi);
@@ -2055,24 +3668,10 @@
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
-  // Initialize cpi->mv_step_param to default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
-  if (sf->mv.auto_mv_step_size) {
-    if (frame_is_intra_only(cm)) {
-      // Initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // Allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(MIN(max_mv_def, 2 *
-                                 cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
+  if (cpi->oxcf.pass == 2 &&
+      cpi->sf.adaptive_interp_filter_search)
+    cpi->sf.interp_filter_search_mask =
+        setup_interp_filter_search_mask(cpi);
 
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm)) {
@@ -2088,9 +3687,8 @@
     // The alternate reference frame cannot be active for a key frame.
     cpi->rc.source_alt_ref_active = 0;
 
-    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
-    cm->frame_parallel_decoding_mode =
-      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+    cm->error_resilient_mode = oxcf->error_resilient_mode;
+    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
 
     // By default, encoder assumes decoder can use prev_mi.
     if (cm->error_resilient_mode) {
@@ -2102,25 +3700,49 @@
       cm->reset_frame_context = 2;
     }
   }
+  if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+    // Use context 0 for intra only empty frame, but the last frame context
+    // for other empty frames.
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
+      if (cpi->svc.encode_intra_empty_frame != 0)
+        cm->frame_context_idx = 0;
+      else
+        cm->frame_context_idx = FRAME_CONTEXTS - 1;
+    } else {
+      cm->frame_context_idx =
+          cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+          cpi->svc.temporal_layer_id;
+    }
 
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed in second pass of two pass (as requires lagged coding)
-  // and if the relevant speed feature flag is set.
-  if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
-    configure_static_seg_features(cpi);
+    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
 
-  // Check if the current frame is skippable for the partition search in the
-  // second pass according to the first pass stats
-  if (cpi->oxcf.pass == 2 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
-    configure_skippable_frame(cpi);
+    // The probs will be updated based on the frame type of the previous
+    // frame if frame_parallel_decoding_mode is 0. That type may vary for
+    // the frame after a key frame in the base layer, since enhancement
+    // layers may be dropped, so set frame_parallel_decoding_mode to 1 here.
+    if (cm->frame_parallel_decoding_mode == 0) {
+      if (cpi->svc.number_temporal_layers == 1) {
+        if (cpi->svc.spatial_layer_id == 0 &&
+            cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+          cm->frame_parallel_decoding_mode = 1;
+      } else if (cpi->svc.spatial_layer_id == 0) {
+        // Find the 2nd frame in temporal base layer and 1st frame in temporal
+        // enhancement layers from the key frame.
+        int i;
+        for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+          if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+            cm->frame_parallel_decoding_mode = 1;
+            break;
+          }
+        }
+      }
+    }
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_CBR &&
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
       cm->frame_type != KEY_FRAME) {
     if (vp9_rc_drop_frame(cpi)) {
       vp9_rc_postencode_update_drop_frame(cpi);
@@ -2129,65 +3751,47 @@
     }
   }
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
-#if CONFIG_VP9_POSTPROC
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    int l = 0;
-    switch (cpi->oxcf.noise_sensitivity) {
-      case 1:
-        l = 20;
-        break;
-      case 2:
-        l = 40;
-        break;
-      case 3:
-        l = 60;
-        break;
-      case 4:
-      case 5:
-        l = 100;
-        break;
-      case 6:
-        l = 150;
-        break;
-    }
-    vp9_denoise(cpi->Source, cpi->Source, l);
-  }
+#if CONFIG_INTERNAL_STATS
+  memset(cpi->mode_chosen_counts, 0,
+         MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
 
-  set_speed_features(cpi);
-
-  // Decide q and q bounds.
-  q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
-
-  if (!frame_is_intra_only(cm)) {
-    cm->interp_filter = cpi->sf.default_interp_filter;
-    /* TODO: Decide this more intelligently */
-    vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
-  }
-
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    encode_without_recode_loop(cpi, q);
+    encode_without_recode_loop(cpi, size, dest);
   } else {
-    encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
+    encode_with_recode_loop(cpi, size, dest);
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (oxcf->noise_sensitivity > 0) {
     vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
                             yuv_denoised_file);
   }
 #endif
 #endif
-
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    vp9_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+                                              get_frame_new_buffer(cm));
+    } else {
+      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    }
+#else
     cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // If the encoder forced a KEY_FRAME decision
@@ -2196,39 +3800,23 @@
 
   cm->frame_to_show = get_frame_new_buffer(cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
   // build the bitstream
-  cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
   if (cm->seg.update_map)
     update_reference_segmentation_map(cpi);
 
-  release_scaled_references(cpi);
+  if (frame_is_intra_only(cm) == 0) {
+    release_scaled_references(cpi);
+  }
   vp9_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
+    full_to_model_counts(cpi->td.counts->coef[t],
+                         cpi->td.rd_counts.coef_counts[t]);
 
   if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
     vp9_adapt_coef_probs(cm);
@@ -2250,10 +3838,12 @@
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
-  get_ref_frame_flags(cpi);
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
   cm->last_frame_type = cm->frame_type;
-  vp9_rc_postencode_update(cpi, *size);
+
+  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
+    vp9_rc_postencode_update(cpi, *size);
 
 #if 0
   output_frame_level_debug_stats(cpi);
@@ -2282,13 +3872,19 @@
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
-
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
     if (cpi->use_svc)
-      vp9_inc_frame_in_layer(&cpi->svc);
+      vp9_inc_frame_in_layer(cpi);
   }
+  cm->prev_frame = cm->cur_frame;
+
+  if (cpi->use_svc)
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                           cpi->svc.number_temporal_layers +
+                           cpi->svc.temporal_layer_id].last_frame_type =
+                               cm->frame_type;
 }
 
 static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -2310,42 +3906,67 @@
 static void Pass2Encode(VP9_COMP *cpi, size_t *size,
                         uint8_t *dest, unsigned int *frame_flags) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-
-  vp9_rc_get_second_pass_params(cpi);
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 
-  vp9_twopass_postencode_update(cpi);
+  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
+    vp9_twopass_postencode_update(cpi);
 }
 
-static void init_motion_estimation(VP9_COMP *cpi) {
-  int y_stride = cpi->scaled_source.y_stride;
-
-  if (cpi->sf.mv.search_method == NSTEP) {
-    vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
-  } else if (cpi->sf.mv.search_method == DIAMOND) {
-    vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+  int i;
+  BufferPool *const pool = cm->buffer_pool;
+  cm->new_fb_idx = INVALID_IDX;
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = INVALID_IDX;
+    pool->frame_bufs[i].ref_count = 0;
   }
 }
 
-static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
-                                int subsampling_y) {
+static void check_initial_width(VP9_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
 
-  if (!cpi->initial_width) {
+  if (!cpi->initial_width ||
+#if CONFIG_VP9_HIGHBITDEPTH
+      cm->use_highbitdepth != use_highbitdepth ||
+#endif
+      cm->subsampling_x != subsampling_x ||
+      cm->subsampling_y != subsampling_y) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = use_highbitdepth;
+#endif
 
     alloc_raw_frame_buffers(cpi);
-    alloc_ref_frame_buffers(cpi);
+    init_ref_frame_bufs(cm);
     alloc_util_frame_buffers(cpi);
 
-    init_motion_estimation(cpi);
+    init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
     cpi->initial_width = cm->width;
     cpi->initial_height = cm->height;
+    cpi->initial_mbs = cm->MBs;
   }
 }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      !cpi->denoiser.frame_buffer_initialized) {
+    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       cm->use_highbitdepth,
+#endif
+                       VP9_ENC_BORDER_IN_PIXELS);
+  }
+}
+#endif
 
 int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -2353,22 +3974,25 @@
   VP9_COMMON *cm = &cpi->common;
   struct vpx_usec_timer timer;
   int res = 0;
-  const int subsampling_x = sd->uv_width  < sd->y_width;
-  const int subsampling_y = sd->uv_height < sd->y_height;
-
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
   check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
   vpx_usec_timer_start(&timer);
 
-#if CONFIG_SPATIAL_SVC
-  if (is_spatial_svc(cpi))
-    res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time,
-                                 frame_flags);
-  else
-#endif
-    res = vp9_lookahead_push(cpi->lookahead,
-                             sd, time_stamp, end_time, frame_flags);
-  if (res)
+  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                         frame_flags))
     res = -1;
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -2376,13 +4000,13 @@
   if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
-                       "Non-4:2:0 color space requires profile 1 or 3");
+                       "Non-4:2:0 color format requires profile 1 or 3");
     res = -1;
   }
   if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
       (subsampling_x == 1 && subsampling_y == 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
-                       "4:2:0 color space requires profile 0 or 2");
+                       "4:2:0 color format requires profile 0 or 2");
     res = -1;
   }
 
@@ -2403,18 +4027,19 @@
          cm->seg.update_data;
 }
 
-void adjust_frame_rate(VP9_COMP *cpi) {
+static void adjust_frame_rate(VP9_COMP *cpi,
+                              const struct lookahead_entry *source) {
   int64_t this_duration;
   int step = 0;
 
-  if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-    this_duration = cpi->source->ts_end - cpi->source->ts_start;
+  if (source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = source->ts_end - source->ts_start;
     step = 1;
   } else {
     int64_t last_duration = cpi->last_end_time_stamp_seen
         - cpi->last_time_stamp_seen;
 
-    this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
 
     // do a step update if the duration changes by 10%
     if (last_duration)
@@ -2428,17 +4053,17 @@
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
-      const double interval = MIN((double)(cpi->source->ts_end
+      const double interval = MIN((double)(source->ts_end
                                    - cpi->first_time_stamp_ever), 10000000.0);
-      double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+      double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
 
       vp9_new_framerate(cpi, 10000000.0 / avg_duration);
     }
   }
-  cpi->last_time_stamp_seen = cpi->source->ts_start;
-  cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
 }
 
 // Returns 0 if this is not an alt ref else the offset of the source frame
@@ -2459,7 +4084,8 @@
   return arf_src_index;
 }
 
-static void check_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi,
+                             const struct lookahead_entry *source) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   if (cpi->oxcf.pass == 2) {
@@ -2468,7 +4094,7 @@
       (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
   } else {
     rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
-                               (cpi->source == cpi->alt_ref_source);
+                               (source == cpi->alt_ref_source);
   }
 
   if (rc->is_src_frame_alt_ref) {
@@ -2481,62 +4107,89 @@
   }
 }
 
+#if CONFIG_INTERNAL_STATS
+extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                                 const uint8_t *img2, int img2_pitch,
+                                 int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  s->stat[Y] += y;
+  s->stat[U] += u;
+  s->stat[V] += v;
+  s->stat[ALL] += all;
+  s->worst = MIN(s->worst, all);
+}
+#endif  // CONFIG_INTERNAL_STATS
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  BufferPool *const pool = cm->buffer_pool;
   RATE_CONTROL *const rc = &cpi->rc;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
-  MV_REFERENCE_FRAME ref_frame;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
   int arf_src_index;
+  int i;
 
-  if (!cpi)
-    return -1;
-
-  if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) {
+  if (is_two_pass_svc(cpi)) {
 #if CONFIG_SPATIAL_SVC
-    vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1);
+    vp9_svc_start_frame(cpi);
+    // Use a small empty frame instead of a real frame
+    if (cpi->svc.encode_empty_frame_state == ENCODING)
+      source = &cpi->svc.empty_frame;
 #endif
-    vp9_restore_layer_context(cpi);
+    if (oxcf->pass == 2)
+      vp9_restore_layer_context(cpi);
+  } else if (is_one_pass_cbr_svc(cpi)) {
+    vp9_one_pass_cbr_svc_start_layer(cpi);
   }
 
   vpx_usec_timer_start(&cmptimer);
 
-  cpi->source = NULL;
-  cpi->last_source = NULL;
-
   vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
+  // Is multi-arf enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with svc.
+  if ((oxcf->pass == 2) && !cpi->use_svc &&
+      (cpi->oxcf.enable_auto_arf > 1))
+    cpi->multi_arf_allowed = 1;
+  else
+    cpi->multi_arf_allowed = 0;
+
   // Normal defaults
   cm->reset_frame_context = 0;
   cm->refresh_frame_context = 1;
-  cpi->refresh_last_frame = 1;
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
+  if (!is_one_pass_cbr_svc(cpi)) {
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+  }
 
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
+
+  // Skip alt frame if we encode the empty frame
+  if (is_two_pass_svc(cpi) && source != NULL)
+    arf_src_index = 0;
+
   if (arf_src_index) {
     assert(arf_src_index <= rc->frames_to_key);
 
-#if CONFIG_SPATIAL_SVC
-    if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
-                                           arf_src_index, 0);
-    else
-#endif
-      cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
-    if (cpi->source != NULL) {
-      cpi->alt_ref_source = cpi->source;
+    if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cpi->alt_ref_source = source;
 
 #if CONFIG_SPATIAL_SVC
-      if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
+      if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
         int i;
         // Reference a hidden frame from a lower layer
         for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
-          if (cpi->oxcf.ss_play_alternate[i]) {
+          if (oxcf->ss_enable_auto_arf[i]) {
             cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
             break;
           }
@@ -2545,14 +4198,15 @@
       cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
 #endif
 
-      if (cpi->oxcf.arnr_max_frames > 0) {
+      if (oxcf->arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         vp9_temporal_filter(cpi, arf_src_index);
-        vp9_extend_frame_borders(&cpi->alt_ref_buffer);
+        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
 
       cm->show_frame = 0;
+      cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
@@ -2563,144 +4217,132 @@
     }
   }
 
-  if (!cpi->source) {
+  if (!source) {
     // Get last frame source.
     if (cm->current_video_frame > 0) {
-#if CONFIG_SPATIAL_SVC
-      if (is_spatial_svc(cpi))
-        cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
-      else
-#endif
-        cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
-      if (cpi->last_source == NULL)
+      if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
         return -1;
     }
 
     // Read in the source frame.
-#if CONFIG_SPATIAL_SVC
-    if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+    if (cpi->use_svc)
+      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
     else
-#endif
-      cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
-    if (cpi->source != NULL) {
+      source = vp9_lookahead_pop(cpi->lookahead, flush);
+
+    if (source != NULL) {
       cm->show_frame = 1;
       cm->intra_only = 0;
+      // If the flags indicate an intra frame, but the current picture is for
+      // a non-zero spatial layer, it should not be an intra picture.
+      // TODO(Won Kap): this needs to change if per-layer intra frames are
+      // allowed.
+      if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id) {
+        source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+      }
 
       // Check to see if the frame should be encoded as an arf overlay.
-      check_src_altref(cpi);
+      check_src_altref(cpi, source);
     }
   }
 
-  if (cpi->source) {
+  if (source) {
     cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
-                                                           : &cpi->source->img;
+                                                           : &source->img;
 
-    if (cpi->last_source != NULL) {
-      cpi->unscaled_last_source = &cpi->last_source->img;
-    } else {
-      cpi->unscaled_last_source = NULL;
-    }
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
 
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags =
-        (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+    *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
   } else {
     *size = 0;
-    if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) {
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
       vp9_end_first_pass(cpi);    /* get last stats packet */
       cpi->twopass.first_pass_done = 1;
     }
     return -1;
   }
 
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
   }
 
   // Clear down mmx registers
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   // adjust frame rates based on timestamps given
   if (cm->show_frame) {
-    adjust_frame_rate(cpi);
+    adjust_frame_rate(cpi, source);
   }
 
-  if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) {
+  if (is_one_pass_cbr_svc(cpi)) {
     vp9_update_temporal_layer_framerate(cpi);
     vp9_restore_layer_context(cpi);
   }
 
-  // start with a 0 size frame
-  *size = 0;
-
-  /* find a free buffer for the new frame, releasing the reference previously
-   * held.
-   */
-  cm->frame_bufs[cm->new_fb_idx].ref_count--;
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+  if (cm->new_fb_idx != INVALID_IDX) {
+    --pool->frame_bufs[cm->new_fb_idx].ref_count;
+  }
   cm->new_fb_idx = get_free_fb(cm);
 
+  if (cm->new_fb_idx == INVALID_IDX)
+    return -1;
+
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
       init_buffer_indices(cpi);
-    } else if (cpi->oxcf.pass == 2) {
+    } else if (oxcf->pass == 2) {
       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
       cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
     }
   }
 
+  // Start with a 0 size frame.
+  *size = 0;
+
   cpi->frame_flags = *frame_flags;
 
-  if (cpi->oxcf.pass == 2 &&
-      cm->current_video_frame == 0 &&
-      cpi->oxcf.allow_spatial_resampling &&
-      cpi->oxcf.rc_mode == VPX_VBR) {
-    // Internal scaling is triggered on the first frame.
-    vp9_set_size_literal(cpi, cpi->oxcf.scaled_frame_width,
-                         cpi->oxcf.scaled_frame_height);
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    vp9_rc_get_second_pass_params(cpi);
+  } else if (oxcf->pass == 1) {
+    set_frame_size(cpi);
   }
 
-  // Reset the frame pointers to the current frame size
-  vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
-                           cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
-                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
-
-  alloc_util_frame_buffers(cpi);
-  init_motion_estimation(cpi);
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
-    ref_buf->buf = buf;
-    ref_buf->idx = idx;
-    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
-                                      buf->y_crop_width, buf->y_crop_height,
-                                      cm->width, cm->height);
-
-    if (vp9_is_scaled(&ref_buf->sf))
-      vp9_extend_frame_borders(buf);
+  if (cpi->oxcf.pass != 0 ||
+      cpi->use_svc ||
+      frame_is_intra_only(cm) == 1) {
+    for (i = 0; i < MAX_REF_FRAMES; ++i)
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
   }
 
-  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-    vp9_vaq_init();
-  }
-
-  if (cpi->oxcf.pass == 1 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
-    const int lossless = is_lossless_requested(&cpi->oxcf);
-    cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-    cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-    vp9_first_pass(cpi);
-  } else if (cpi->oxcf.pass == 2 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
+  if (oxcf->pass == 1 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->oxcf.use_highbitdepth)
+      cpi->td.mb.fwd_txm4x4 = lossless ?
+          vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+    else
+      cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+    cpi->td.mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+                                         vp9_highbd_idct4x4_add;
+#else
+    cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    vp9_first_pass(cpi, source);
+  } else if (oxcf->pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
     Pass2Encode(cpi, size, dest, frame_flags);
   } else if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
@@ -2710,10 +4352,10 @@
   }
 
   if (cm->refresh_frame_context)
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 
-  // Frame was dropped, release scaled references.
-  if (*size == 0) {
+  // No frame encoded, or frame was dropped; release scaled references.
+  if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
     release_scaled_references(cpi);
   }
 
@@ -2722,21 +4364,23 @@
   }
 
   // Save layer specific state.
-  if ((cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+  if (is_one_pass_cbr_svc(cpi) ||
+        ((cpi->svc.number_temporal_layers > 1 ||
+          cpi->svc.number_spatial_layers > 1) &&
+         oxcf->pass == 2)) {
     vp9_save_layer_context(cpi);
   }
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
-  if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame)
+  if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
 
 #if CONFIG_INTERNAL_STATS
 
-  if (cpi->oxcf.pass != 1) {
+  if (oxcf->pass != 1) {
+    double samples = 0.0;
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
@@ -2747,41 +4391,77 @@
         YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
         YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
         PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
+                         cpi->oxcf.input_bit_depth);
+#else
         calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        cpi->total += psnr.psnr[0];
-        cpi->total_y += psnr.psnr[1];
-        cpi->total_u += psnr.psnr[2];
-        cpi->total_v += psnr.psnr[3];
+        adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+                          psnr.psnr[0], &cpi->psnr);
         cpi->total_sq_error += psnr.sse[0];
         cpi->total_samples += psnr.samples[0];
+        samples = psnr.samples[0];
 
         {
           PSNR_STATS psnr2;
           double frame_ssim2 = 0, weight = 0;
 #if CONFIG_VP9_POSTPROC
-          // TODO(agrange) Add resizing of post-proc buffer in here when the
-          // encoder is changed to use on-demand buffer allocation.
+          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
+                                     recon->y_crop_width, recon->y_crop_height,
+                                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     cm->use_highbitdepth,
+#endif
+                                     VP9_ENC_BORDER_IN_PIXELS,
+                                     cm->byte_alignment) < 0) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate post processing buffer");
+          }
+
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                       cm->lf.filter_level * 10 / 6);
 #endif
-          vp9_clear_system_state();
+          vpx_clear_system_state();
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
+                           cpi->oxcf.input_bit_depth);
+#else
           calc_psnr(orig, pp, &psnr2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-          cpi->totalp += psnr2.psnr[0];
-          cpi->totalp_y += psnr2.psnr[1];
-          cpi->totalp_u += psnr2.psnr[2];
-          cpi->totalp_v += psnr2.psnr[3];
           cpi->totalp_sq_error += psnr2.sse[0];
           cpi->totalp_samples += psnr2.samples[0];
+          adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+                            psnr2.psnr[0], &cpi->psnrp);
 
-          frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
+                                               (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
+          cpi->worst_ssim = MIN(cpi->worst_ssim, frame_ssim2);
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
-          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(
+                orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
@@ -2796,19 +4476,101 @@
 #endif
         }
       }
+      if (cpi->b_calculate_blockiness) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double frame_blockiness = vp9_get_blockiness(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height);
+          cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness);
+          cpi->total_blockiness += frame_blockiness;
+        }
+      }
+
+      if (cpi->b_calculate_consistency) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double this_inconsistency = vpx_get_ssim_metrics(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+              &cpi->metrics, 1);
+
+          const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+          double consistency = vpx_sse_to_psnr(samples, peak,
+                                             (double)cpi->total_inconsistency);
+          if (consistency > 0.0)
+            cpi->worst_consistency = MIN(cpi->worst_consistency,
+                                         consistency);
+          cpi->total_inconsistency += this_inconsistency;
+        }
+      }
 
       if (cpi->b_calculate_ssimg) {
         double y, u, v, frame_all;
-        frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
-        cpi->total_ssimg_y += y;
-        cpi->total_ssimg_u += u;
-        cpi->total_ssimg_v += v;
-        cpi->total_ssimg_all += frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+                                            &u, &v, (int)cm->bit_depth);
+        } else {
+          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+                                     &v);
+        }
+#else
+        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+                                      &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+        /* TODO(JBB): add 10/12 bit support */
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
       }
     }
   }
 
 #endif
+
+  if (is_two_pass_svc(cpi)) {
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
+      cpi->svc.encode_empty_frame_state = ENCODED;
+      cpi->svc.encode_intra_empty_frame = 0;
+    }
+
+    if (cm->show_frame) {
+      ++cpi->svc.spatial_layer_to_encode;
+      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+        cpi->svc.spatial_layer_to_encode = 0;
+
+      // May need the empty frame after a visible frame.
+      cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE;
+    }
+  } else if (is_one_pass_cbr_svc(cpi)) {
+    if (cm->show_frame) {
+      ++cpi->svc.spatial_layer_to_encode;
+      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+        cpi->svc.spatial_layer_to_encode = 0;
+    }
+  }
+  vpx_clear_system_state();
   return 0;
 }
 
@@ -2837,34 +4599,11 @@
       ret = -1;
     }
 #endif  // !CONFIG_VP9_POSTPROC
-    vp9_clear_system_state();
+    vpx_clear_system_state();
     return ret;
   }
 }
 
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
-    const int mi_rows = cpi->common.mi_rows;
-    const int mi_cols = cpi->common.mi_cols;
-    if (map) {
-      int r, c;
-      for (r = 0; r < mi_rows; r++) {
-        for (c = 0; c < mi_cols; c++) {
-          cpi->segmentation_map[r * mi_cols + c] =
-              !map[(r >> 1) * cols + (c >> 1)];
-        }
-      }
-      vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP);
-      vp9_enable_segmentation(&cpi->common.seg);
-    } else {
-      vp9_disable_segmentation(&cpi->common.seg);
-    }
-    return 0;
-  } else {
-    return -1;
-  }
-}
-
 int vp9_set_internal_size(VP9_COMP *cpi,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
   VP9_COMMON *cm = &cpi->common;
@@ -2890,15 +4629,18 @@
 int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
   VP9_COMMON *cm = &cpi->common;
-
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
   check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
 
   if (width) {
     cm->width = width;
-    if (cm->width * 5 < cpi->initial_width) {
-      cm->width = cpi->initial_width / 5 + 1;
-      printf("Warning: Desired width too small, changed to %d\n", cm->width);
-    }
     if (cm->width > cpi->initial_width) {
       cm->width = cpi->initial_width;
       printf("Warning: Desired width too large, changed to %d\n", cm->width);
@@ -2907,10 +4649,6 @@
 
   if (height) {
     cm->height = height;
-    if (cm->height * 5 < cpi->initial_height) {
-      cm->height = cpi->initial_height / 5 + 1;
-      printf("Warning: Desired height too small, changed to %d\n", cm->height);
-    }
     if (cm->height > cpi->initial_height) {
       cm->height = cpi->initial_height;
       printf("Warning: Desired height too large, changed to %d\n", cm->height);
@@ -2929,14 +4667,27 @@
   return;
 }
 
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
   assert(a->y_crop_width == b->y_crop_width);
   assert(a->y_crop_height == b->y_crop_height);
 
-  return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                      a->y_crop_width, a->y_crop_height);
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                 a->y_crop_width, a->y_crop_height);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                        a->y_crop_width, a->y_crop_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
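The internal-stats path above accumulates per-frame SSE and sample counts and later converts them to PSNR via vpx_sse_to_psnr(samples, peak, sse). A minimal sketch of that conversion, assuming the conventional PSNR definition (the library helper may clamp or special-case differently); sse_to_psnr_sketch() and the numbers below are illustrative only and not part of the patch:

#include <math.h>

/* Sketch of SSE-to-PSNR conversion, assuming the conventional definition.
 * The library's vpx_sse_to_psnr() may cap the result differently. */
static double sse_to_psnr_sketch(double samples, double peak, double sse) {
  if (sse <= 0.0)
    return 100.0;  /* assumed cap for a numerically lossless frame */
  return 10.0 * log10(samples * peak * peak / sse);
}

/* Example: an 8-bit 1920x1080 luma plane with SSE of 5e6 gives
 * sse_to_psnr_sketch(1920.0 * 1080.0, 255.0, 5e6) ~= 44.3 dB. */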
diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h
index c841da2..c10abd2 100644
--- a/libvpx/vp9/encoder/vp9_encoder.h
+++ b/libvpx/vp9/encoder/vp9_encoder.h
@@ -14,13 +14,18 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
-#include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_thread.h"
 
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_ppflags.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
@@ -36,7 +41,7 @@
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
 #endif
@@ -45,14 +50,12 @@
 extern "C" {
 #endif
 
-#define DEFAULT_GF_INTERVAL         10
-
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  vpx_prob segment_pred_probs[PREDICTION_PROBS];
 
   unsigned char *last_frame_seg_map_copy;
 
@@ -82,36 +85,19 @@
 } VPX_SCALING;
 
 typedef enum {
-  // Good Quality Fast Encoding. The encoder balances quality with the
-  // amount of time it takes to encode the output. (speed setting
-  // controls how fast)
-  ONE_PASS_GOOD = 1,
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
 
-  // One Pass - Best Quality. The encoder places priority on the
-  // quality of the output over encoding speed. The output is compressed
-  // at the highest possible quality. This option takes the longest
-  // amount of time to encode. (speed setting ignored)
-  ONE_PASS_BEST = 2,
+  // The encoder places priority on the quality of the output over encoding
+  // speed. The output is compressed at the highest possible quality. This
+  // option takes the longest amount of time to encode. Speed setting ignored.
+  BEST,
 
-  // Two Pass - First Pass. The encoder generates a file of statistics
-  // for use in the second encoding pass. (speed setting controls how fast)
-  TWO_PASS_FIRST = 3,
-
-  // Two Pass - Second Pass. The encoder uses the statistics that were
-  // generated in the first encoding pass to create the compressed
-  // output. (speed setting controls how fast)
-  TWO_PASS_SECOND_GOOD = 4,
-
-  // Two Pass - Second Pass Best.  The encoder uses the statistics that
-  // were generated in the first encoding pass to create the compressed
-  // output using the highest possible quality, and taking a
-  // longer amount of time to encode. (speed setting ignored)
-  TWO_PASS_SECOND_BEST = 5,
-
-  // Realtime/Live Encoding. This mode is optimized for realtime
-  // encoding (for example, capturing a television signal or feed from
-  // a live camera). (speed setting controls how fast)
-  REALTIME = 6,
+  // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+  // example, capturing a television signal or feed from a live camera). Speed
+  // setting controls how fast.
+  REALTIME
 } MODE;
 
 typedef enum {
@@ -128,19 +114,30 @@
   AQ_MODE_COUNT  // This should always be the last member of the enum
 } AQ_MODE;
 
+typedef enum {
+  RESIZE_NONE = 0,    // No frame resizing allowed (except for SVC).
+  RESIZE_FIXED = 1,   // All frames are coded at the specified dimension.
+  RESIZE_DYNAMIC = 2  // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
 
 typedef struct VP9EncoderConfig {
   BITSTREAM_PROFILE profile;
-  BIT_DEPTH bit_depth;
+  vpx_bit_depth_t bit_depth;     // Codec bit-depth.
   int width;  // width of data passed to the compressor
   int height;  // height of data passed to the compressor
-  double framerate;  // set to passed in framerate
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;  // set to passed in framerate
   int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;  // sharpening output: recommendation 0:
   int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
   unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
 
   MODE mode;
   int pass;
@@ -177,7 +174,7 @@
   AQ_MODE aq_mode;  // Adaptive Quantization mode
 
   // Internal frame size scaling.
-  int allow_spatial_resampling;
+  RESIZE_TYPE resize_mode;
   int scaled_frame_width;
   int scaled_frame_height;
 
@@ -195,14 +192,13 @@
   int ss_number_layers;  // Number of spatial layers.
   int ts_number_layers;  // Number of temporal layers.
   // Bitrate allocation for spatial layers.
+  int layer_target_bitrate[VPX_MAX_LAYERS];
   int ss_target_bitrate[VPX_SS_MAX_LAYERS];
-  int ss_play_alternate[VPX_SS_MAX_LAYERS];
+  int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
   // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
-  int ts_target_bitrate[VPX_TS_MAX_LAYERS];
   int ts_rate_decimator[VPX_TS_MAX_LAYERS];
 
-  // these parameters aren't to be used in final build don't use!!!
-  int play_alternate;
+  int enable_auto_arf;
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -220,39 +216,88 @@
 
   int arnr_max_frames;
   int arnr_strength;
-  int arnr_type;
+
+  int min_gf_interval;
+  int max_gf_interval;
 
   int tile_columns;
   int tile_rows;
 
-  struct vpx_fixed_buf         two_pass_stats_in;
-  struct vpx_codec_pkt_list  *output_pkt_list;
+  int max_threads;
+
+  vpx_fixed_buf_t two_pass_stats_in;
+  struct vpx_codec_pkt_list *output_pkt_list;
 
 #if CONFIG_FP_MB_STATS
-  struct vpx_fixed_buf         firstpass_mb_stats_in;
+  vpx_fixed_buf_t firstpass_mb_stats_in;
 #endif
 
   vp8e_tuning tuning;
   vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;
+#endif
+  vpx_color_space_t color_space;
+  VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
-static INLINE int is_best_mode(MODE mode) {
-  return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
-}
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+  TileInfo tile_info;
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+  int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+  vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+  MACROBLOCK mb;
+  RD_COUNTS rd_counts;
+  FRAME_COUNTS *counts;
+
+  PICK_MODE_CONTEXT *leaf_tree;
+  PC_TREE *pc_tree;
+  PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+  int enabled;
+  int update;
+  unsigned char *map;
+} ActiveMap;
+
+typedef enum {
+  Y,
+  U,
+  V,
+  ALL
+} STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+  double stat[ALL+1];
+  double worst;
+} ImageStat;
 
 typedef struct VP9_COMP {
   QUANTS quants;
-  MACROBLOCK mb;
+  ThreadData td;
+  MB_MODE_INFO_EXT *mbmi_ext_base;
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
   VP9_COMMON common;
   VP9EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
   struct lookahead_entry  *alt_ref_source;
-  struct lookahead_entry  *last_source;
 
   YV12_BUFFER_CONFIG *Source;
   YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
@@ -261,13 +306,13 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  int gold_is_last;  // gold same as last frame ( short circuit gold searches)
-  int alt_is_last;  // Alt same as last ( short circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search ( just do gold).
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
 
-  int skippable_frame;
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
 
-  int scaled_ref_idx[3];
+  int scaled_ref_idx[MAX_REF_FRAMES];
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
@@ -286,28 +331,29 @@
 
   YV12_BUFFER_CONFIG last_frame_uf;
 
-  TOKENEXTRA *tok;
+  TOKENEXTRA *tile_tok[4][1 << 6];
   unsigned int tok_count[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
-  int ambient_err;
+  int64_t ambient_err;
 
   RD_OPT rd;
 
   CODING_CONTEXT coding_context;
 
-  int zbin_mode_boost;
-  int zbin_mode_boost_enabled;
-  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
-  int active_arnr_strength;         // <= cpi->oxcf.arnr_max_strength
+  int *nmvcosts[2];
+  int *nmvcosts_hp[2];
+  int *nmvsadcosts[2];
+  int *nmvsadcosts_hp[2];
 
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
   int64_t first_time_stamp_ever;
 
   RATE_CONTROL rc;
+  double framerate;
 
-  vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+  int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
 
   struct vpx_codec_pkt_list  *output_pkt_list;
 
@@ -321,6 +367,8 @@
   unsigned int max_mv_magnitude;
   int mv_step_param;
 
+  int allow_comp_inter_inter;
+
   // Default value is 1. From first pass stats, encode_breakout may be disabled.
   ENCODE_BREAKOUT_TYPE allow_encode_breakout;
 
@@ -333,13 +381,11 @@
   // segment threashold for encode breakout
   int  segment_encode_breakout[MAX_SEGMENTS];
 
-  unsigned char *complexity_map;
-
   CYCLIC_REFRESH *cyclic_refresh;
+  ActiveMap active_map;
 
   fractional_mv_step_fp *find_fractional_mv_step;
   vp9_full_search_fn_t full_search_sad;
-  vp9_refining_search_fn_t refining_search_sad;
   vp9_diamond_search_fn_t diamond_search_sad;
   vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
   uint64_t time_receive_data;
@@ -354,25 +400,22 @@
   TWO_PASS twopass;
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+
 
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
 
   int    count;
-  double total_y;
-  double total_u;
-  double total_v;
-  double total;
   uint64_t total_sq_error;
   uint64_t total_samples;
+  ImageStat psnr;
 
-  double totalp_y;
-  double totalp_u;
-  double totalp_v;
-  double totalp;
   uint64_t totalp_sq_error;
   uint64_t totalp_samples;
+  ImageStat psnrp;
+
+  double total_blockiness;
+  double worst_blockiness;
 
   int    bytes;
   double summed_quality;
@@ -380,25 +423,32 @@
   double summedp_quality;
   double summedp_weights;
   unsigned int tot_recode_hits;
+  double worst_ssim;
 
-
-  double total_ssimg_y;
-  double total_ssimg_u;
-  double total_ssimg_v;
-  double total_ssimg_all;
+  ImageStat ssimg;
+  ImageStat fastssim;
+  ImageStat psnrhvs;
 
   int b_calculate_ssimg;
+  int b_calculate_blockiness;
+
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
 #endif
   int b_calculate_psnr;
 
   int droppable;
 
-  int dummy_packing;    /* flag to indicate if packing is dummy */
-
-  unsigned int tx_stepdown_count[TX_SIZES];
-
   int initial_width;
   int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
 
   int use_svc;
 
@@ -415,14 +465,10 @@
   search_site_config ss_cfg;
 
   int mbmode_cost[INTRA_MODES];
-  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
-  PICK_MODE_CONTEXT *leaf_tree;
-  PC_TREE *pc_tree;
-  PC_TREE *pc_root;
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
 
   int multi_arf_allowed;
@@ -432,11 +478,34 @@
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
+
+  int resize_pending;
+  int resize_state;
+  int resize_scale_num;
+  int resize_scale_den;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+
+  // VAR_BASED_PARTITION thresholds
+  // 0 - threshold_64x64; 1 - threshold_32x32;
+  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[4];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
+  BLOCK_SIZE vbp_bsize_min;
+
+  // Multi-threading
+  int num_workers;
+  VPxWorker *workers;
+  struct EncWorkerData *tile_thr_data;
+  VP9LfSync lf_row_sync;
 } VP9_COMP;
 
-void vp9_initialize_enc();
+void vp9_initialize_enc(void);
 
-struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+                                       BufferPool *const pool);
 void vp9_remove_compressor(VP9_COMP *cpi);
 
 void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -468,6 +537,8 @@
 
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
 
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
 int vp9_set_internal_size(VP9_COMP *cpi,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
 
@@ -478,8 +549,14 @@
 
 int vp9_get_quantizer(struct VP9_COMP *cpi);
 
-static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
-                                    MV_REFERENCE_FRAME ref_frame) {
+static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
+  return frame_is_intra_only(&cpi->common) ||
+         cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
     return cpi->lst_fb_idx;
   } else if (ref_frame == GOLDEN_FRAME) {
@@ -489,19 +566,19 @@
   }
 }
 
-static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
-    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  VP9_COMMON * const cm = &cpi->common;
-  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
-      .buf;
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+                                        int ref_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
 }
 
-// Intra only frames, golden frames (except alt ref overlays) and
-// alt ref frames tend to be coded at a higher than ambient quality
-static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
-  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
-         vp9_is_upper_layer_key_frame(cpi);
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  return
+      buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
 }
 
 static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
@@ -513,7 +590,20 @@
   return mb_rows * mb_cols * (16 * 16 * 3 + 4);
 }
 
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE int allocated_tokens(TileInfo tile) {
+  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols);
+}
+
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_alloc_compressor_data(VP9_COMP *cpi);
 
@@ -521,8 +611,6 @@
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
-int64_t vp9_rescale(int64_t val, int64_t num, int denom);
-
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
@@ -531,17 +619,19 @@
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
-static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) {
-  return cpi->use_svc &&
-         cpi->svc.number_temporal_layers == 1 &&
-         cpi->svc.number_spatial_layers > 1;
+static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
+  return cpi->use_svc && cpi->oxcf.pass != 0;
+}
+
+static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+  return (cpi->use_svc && cpi->oxcf.pass == 0);
 }
 
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
   return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
-         (cpi->oxcf.play_alternate &&
-          (!is_spatial_svc(cpi) ||
-           cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
+         (cpi->oxcf.enable_auto_arf &&
+          (!is_two_pass_svc(cpi) ||
+           cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
 }
 
 static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -557,6 +647,14 @@
   return frame_index & 0x1;
 }
 
+static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
+  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+}
+
+void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
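The LAYER_IDS_TO_IDX macro added above flattens a (spatial layer, temporal layer) pair into a single array index. A small, self-contained usage sketch; the layer counts are made-up example values, not taken from the patch:

#include <stdio.h>

#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))

int main(void) {
  const int num_spatial = 3, num_temporal = 2;  /* illustrative configuration */
  int sl, tl;
  /* Print the flat index a layer-context array would be addressed with. */
  for (sl = 0; sl < num_spatial; ++sl)
    for (tl = 0; tl < num_temporal; ++tl)
      printf("sl=%d tl=%d -> idx=%d\n", sl, tl,
             LAYER_IDS_TO_IDX(sl, tl, num_temporal));
  return 0;
}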
diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c
new file mode 100644
index 0000000..00025b7
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int i, j, k, l, m, n;
+
+  for (i = 0; i < REFERENCE_MODES; i++)
+    td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
+
+  for (i = 0; i < TX_SIZES; i++)
+    for (j = 0; j < PLANE_TYPES; j++)
+      for (k = 0; k < REF_TYPES; k++)
+        for (l = 0; l < COEF_BANDS; l++)
+          for (m = 0; m < COEFF_CONTEXTS; m++)
+            for (n = 0; n < ENTROPY_TOKENS; n++)
+              td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+                  td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  int t;
+
+  (void) unused;
+
+  for (t = thread_data->start; t < tile_rows * tile_cols;
+      t += cpi->num_workers) {
+    int tile_row = t / tile_cols;
+    int tile_col = t % tile_cols;
+
+    vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+  }
+
+  return 0;
+}
+
+static int get_max_tile_cols(VP9_COMP *cpi) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
+  int mi_cols = aligned_width >> MI_SIZE_LOG2;
+  int min_log2_tile_cols, max_log2_tile_cols;
+  int log2_tile_cols;
+
+  vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+  log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+                   min_log2_tile_cols, max_log2_tile_cols);
+  return (1 << log2_tile_cols);
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+
+  vp9_init_tile_data(cpi);
+
+  // Only run once to create threads and allocate thread data.
+  if (cpi->num_workers == 0) {
+    int allocated_workers = num_workers;
+
+    // While using SVC, we need to allocate threads according to the highest
+    // resolution.
+    if (cpi->use_svc) {
+      int max_tile_cols = get_max_tile_cols(cpi);
+      allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols);
+    }
+
+    CHECK_MEM_ERROR(cm, cpi->workers,
+                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
+
+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                    vpx_calloc(allocated_workers,
+                    sizeof(*cpi->tile_thr_data)));
+
+    for (i = 0; i < allocated_workers; i++) {
+      VPxWorker *const worker = &cpi->workers[i];
+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+      ++cpi->num_workers;
+      winterface->init(worker);
+
+      if (i < allocated_workers - 1) {
+        thread_data->cpi = cpi;
+
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        vpx_memalign(32, sizeof(*thread_data->td)));
+        vp9_zero(*thread_data->td);
+
+        // Set up pc_tree.
+        thread_data->td->leaf_tree = NULL;
+        thread_data->td->pc_tree = NULL;
+        vp9_setup_pc_tree(cm, thread_data->td);
+
+        // Allocate frame counters in thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Create threads
+        if (!winterface->reset(worker))
+          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                             "Tile encoder thread creation failed");
+      } else {
+        // Main thread acts as a worker and uses the thread data in cpi.
+        thread_data->cpi = cpi;
+        thread_data->td = &cpi->td;
+      }
+
+      winterface->sync(worker);
+    }
+  }
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data;
+
+    worker->hook = (VPxWorkerHook)enc_worker_hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = NULL;
+    thread_data = (EncWorkerData*)worker->data1;
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->rd_counts = cpi->td.rd_counts;
+    }
+    if (thread_data->td->counts != &cpi->common.counts) {
+      memcpy(thread_data->td->counts, &cpi->common.counts,
+             sizeof(cpi->common.counts));
+    }
+
+    // Handle use_nonrd_pick_mode case.
+    if (cpi->sf.use_nonrd_pick_mode) {
+      MACROBLOCK *const x = &thread_data->td->mb;
+      MACROBLOCKD *const xd = &x->e_mbd;
+      struct macroblock_plane *const p = x->plane;
+      struct macroblockd_plane *const pd = xd->plane;
+      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+      int j;
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        p[j].coeff = ctx->coeff_pbuf[j][0];
+        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+        p[j].eobs = ctx->eobs_pbuf[j][0];
+      }
+    }
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
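enc_worker_hook() above assigns tiles to workers in an interleaved, round-robin fashion: worker i encodes tiles i, i + num_workers, i + 2 * num_workers, and so on. A standalone sketch of that schedule; the tile/worker counts and show_tile_schedule() are illustrative, not part of the patch:

#include <stdio.h>

/* Print which worker would encode which tile under the interleaved
 * assignment used by enc_worker_hook(). */
static void show_tile_schedule(int tile_rows, int tile_cols, int num_workers) {
  int start;
  for (start = 0; start < num_workers; ++start) {
    int t;
    printf("worker %d:", start);
    for (t = start; t < tile_rows * tile_cols; t += num_workers)
      printf(" (row %d, col %d)", t / tile_cols, t % tile_cols);
    printf("\n");
  }
}

int main(void) {
  show_tile_schedule(1, 4, 2);  /* e.g. 4 tile columns split across 2 threads */
  return 0;
}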
diff --git a/libvpx/vp9/encoder/vp9_ethread.h b/libvpx/vp9/encoder/vp9_ethread.h
new file mode 100644
index 0000000..e87c50b
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ETHREAD_H_
+#define VP9_ENCODER_VP9_ETHREAD_H_
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+  struct VP9_COMP *cpi;
+  struct ThreadData *td;
+  int start;
+} EncWorkerData;
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+#endif  // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/libvpx/vp9/encoder/vp9_extend.c b/libvpx/vp9/encoder/vp9_extend.c
index e8517c8..0c304dc 100644
--- a/libvpx/vp9/encoder/vp9_extend.c
+++ b/libvpx/vp9/encoder/vp9_extend.c
@@ -9,6 +9,7 @@
  */
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_extend.h"
@@ -27,9 +28,9 @@
   uint8_t *dst_ptr2 = dst + w;
 
   for (i = 0; i < h; i++) {
-    vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
-    vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w);
-    vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    memset(dst_ptr2, src_ptr2[0], extend_right);
     src_ptr1 += src_pitch;
     src_ptr2 += src_pitch;
     dst_ptr1 += dst_pitch;
@@ -45,16 +46,62 @@
   linesize = extend_left + extend_right + w;
 
   for (i = 0; i < extend_top; i++) {
-    vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+    memcpy(dst_ptr1, src_ptr1, linesize);
     dst_ptr1 += dst_pitch;
   }
 
   for (i = 0; i < extend_bottom; i++) {
-    vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+    memcpy(dst_ptr2, src_ptr2, linesize);
     dst_ptr2 += dst_pitch;
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  int i, linesize;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Copy the left-most and right-most columns out.
+  const uint16_t *src_ptr1 = src;
+  const uint16_t *src_ptr2 = src + w - 1;
+  uint16_t *dst_ptr1 = dst - extend_left;
+  uint16_t *dst_ptr2 = dst + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_pitch;
+    src_ptr2 += src_pitch;
+    dst_ptr1 += dst_pitch;
+    dst_ptr2 += dst_pitch;
+  }
+
+  // Now copy the top and bottom lines into each line of the respective
+  // borders
+  src_ptr1 = dst - extend_left;
+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * (h) - extend_left;
+  linesize = extend_left + extend_right + w;
+
+  for (i = 0; i < extend_top; i++) {
+    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+    dst_ptr1 += dst_pitch;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+    dst_ptr2 += dst_pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
   // Extend src frame in buffer
@@ -64,10 +111,10 @@
   // Motion estimation may use src block variance with the block size up
   // to 64x64, so the right and bottom need to be extended to 64 multiple
   // or up to 16, whichever is greater.
-  const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
-                       16);
-  const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
-                       16);
+  const int er_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
+      - src->y_crop_width;
+  const int eb_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
+      - src->y_crop_height;
   const int uv_width_subsampling = (src->uv_width != src->y_width);
   const int uv_height_subsampling = (src->uv_height != src->y_height);
   const int et_uv = et_y >> uv_height_subsampling;
@@ -75,19 +122,39 @@
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+                                 dst->y_buffer, dst->y_stride,
+                                 src->y_crop_width, src->y_crop_height,
+                                 et_y, el_y, eb_y, er_y);
+
+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                                 dst->u_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+
+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                                 dst->v_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   copy_and_extend_plane(src->y_buffer, src->y_stride,
                         dst->y_buffer, dst->y_stride,
-                        src->y_width, src->y_height,
+                        src->y_crop_width, src->y_crop_height,
                         et_y, el_y, eb_y, er_y);
 
   copy_and_extend_plane(src->u_buffer, src->uv_stride,
                         dst->u_buffer, dst->uv_stride,
-                        src->uv_width, src->uv_height,
+                        src->uv_crop_width, src->uv_crop_height,
                         et_uv, el_uv, eb_uv, er_uv);
 
   copy_and_extend_plane(src->v_buffer, src->uv_stride,
                         dst->v_buffer, dst->uv_stride,
-                        src->uv_width, src->uv_height,
+                        src->uv_crop_width, src->uv_crop_height,
                         et_uv, el_uv, eb_uv, er_uv);
 }
 
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 295e437..e0c5966 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -12,16 +12,18 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
@@ -33,41 +35,42 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
 
-#define OUTPUT_FPF 0
+#define OUTPUT_FPF          0
+#define ARF_STATS_OUTPUT    0
 
-#define IIFACTOR   12.5
-#define IIKFACTOR1 12.5
-#define IIKFACTOR2 15.0
-#define RMAX       512.0
-#define GF_RMAX    96.0
-#define ERR_DIVISOR   150.0
-#define MIN_DECAY_FACTOR 0.1
-#define SVC_FACTOR_PT_LOW 0.45
-#define FACTOR_PT_LOW 0.5
-#define FACTOR_PT_HIGH 0.9
+#define GROUP_ADAPTIVE_MAXQ 1
 
-#define KF_MB_INTRA_MIN 150
-#define GF_MB_INTRA_MIN 100
+#define BOOST_BREAKOUT      12.5
+#define BOOST_FACTOR        12.5
+#define ERR_DIVISOR         128.0
+#define FACTOR_PT_LOW       0.70
+#define FACTOR_PT_HIGH      0.90
+#define FIRST_PASS_Q        10.0
+#define GF_MAX_BOOST        96.0
+#define INTRA_MODE_PENALTY  1024
+#define KF_MAX_BOOST        128.0
+#define MIN_ARF_GF_BOOST    240
+#define MIN_DECAY_FACTOR    0.01
+#define MIN_KF_BOOST        300
+#define NEW_MV_MODE_PENALTY 32
+#define SVC_FACTOR_PT_LOW   0.45
+#define DARK_THRESH         64
+#define DEFAULT_GRP_WEIGHT  1.0
+#define RC_FACTOR_MIN       0.75
+#define RC_FACTOR_MAX       1.75
+
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
-#define MIN_KF_BOOST        300
-#define MIN_GF_INTERVAL     4
-
-static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
-  YV12_BUFFER_CONFIG temp = *a;
-  *a = *b;
-  *b = temp;
-}
-
-static int gfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
 
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
@@ -76,16 +79,6 @@
   p->stats_in = position;
 }
 
-static int lookup_next_frame_stats(const TWO_PASS *p,
-                                   FIRSTPASS_STATS *next_frame) {
-  if (p->stats_in >= p->stats_in_end)
-    return EOF;
-
-  *next_frame = *p->stats_in;
-  return 1;
-}
-
-
 // Read frame stats at an offset from the current position.
 static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
   if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
@@ -119,10 +112,11 @@
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
-            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
-            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+    fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
             stats->frame,
+            stats->weight,
             stats->intra_error,
             stats->coded_error,
             stats->sr_coded_error,
@@ -130,6 +124,9 @@
             stats->pcnt_motion,
             stats->pcnt_second_ref,
             stats->pcnt_neutral,
+            stats->intra_skip_pct,
+            stats->inactive_zone_rows,
+            stats->inactive_zone_cols,
             stats->MVr,
             stats->mvr_abs,
             stats->MVc,
@@ -151,13 +148,14 @@
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
   pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t);
+  pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
   vpx_codec_pkt_list_add(pktlist, &pkt);
 }
 #endif
 
 static void zero_stats(FIRSTPASS_STATS *section) {
-  section->frame      = 0.0;
+  section->frame = 0.0;
+  section->weight = 0.0;
   section->intra_error = 0.0;
   section->coded_error = 0.0;
   section->sr_coded_error = 0.0;
@@ -165,7 +163,10 @@
   section->pcnt_motion  = 0.0;
   section->pcnt_second_ref = 0.0;
   section->pcnt_neutral = 0.0;
-  section->MVr        = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+  section->MVr = 0.0;
   section->mvr_abs     = 0.0;
   section->MVc        = 0.0;
   section->mvc_abs     = 0.0;
@@ -181,6 +182,7 @@
 static void accumulate_stats(FIRSTPASS_STATS *section,
                              const FIRSTPASS_STATS *frame) {
   section->frame += frame->frame;
+  section->weight += frame->weight;
   section->spatial_layer_id = frame->spatial_layer_id;
   section->intra_error += frame->intra_error;
   section->coded_error += frame->coded_error;
@@ -189,7 +191,10 @@
   section->pcnt_motion += frame->pcnt_motion;
   section->pcnt_second_ref += frame->pcnt_second_ref;
   section->pcnt_neutral += frame->pcnt_neutral;
-  section->MVr        += frame->MVr;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
+  section->MVr += frame->MVr;
   section->mvr_abs     += frame->mvr_abs;
   section->MVc        += frame->MVc;
   section->mvc_abs     += frame->mvc_abs;
@@ -204,6 +209,7 @@
 static void subtract_stats(FIRSTPASS_STATS *section,
                            const FIRSTPASS_STATS *frame) {
   section->frame -= frame->frame;
+  section->weight -= frame->weight;
   section->intra_error -= frame->intra_error;
   section->coded_error -= frame->coded_error;
   section->sr_coded_error -= frame->sr_coded_error;
@@ -211,7 +217,10 @@
   section->pcnt_motion -= frame->pcnt_motion;
   section->pcnt_second_ref -= frame->pcnt_second_ref;
   section->pcnt_neutral -= frame->pcnt_neutral;
-  section->MVr        -= frame->MVr;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
   section->mvr_abs     -= frame->mvr_abs;
   section->MVc        -= frame->MVc;
   section->mvc_abs     -= frame->mvc_abs;
@@ -223,17 +232,42 @@
   section->duration   -= frame->duration;
 }
 
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const VP9_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame) {
+  double active_pct;
+
+  active_pct = 1.0 -
+    ((this_frame->intra_skip_pct / 2) +
+     ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-static double calculate_modified_err(const TWO_PASS *twopass,
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const VP9_COMP *cpi,
+                                     const TWO_PASS *twopass,
                                      const VP9EncoderConfig *oxcf,
                                      const FIRSTPASS_STATS *this_frame) {
   const FIRSTPASS_STATS *const stats = &twopass->total_stats;
-  const double av_err = stats->coded_error / stats->count;
-  const double modified_error = av_err *
-      pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err),
-          oxcf->two_pass_vbrbias / 100.0);
+  const double av_weight = stats->weight / stats->count;
+  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  double modified_error =
+    av_err * pow(this_frame->coded_error * this_frame->weight /
+                 DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (e.g. due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_error *=
+    pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
   return fclamp(modified_error,
                 twopass->modified_error_min, twopass->modified_error_max);
 }
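
For a rough feel of the arithmetic in calculate_modified_err() above: the per-frame
weighted coded error is pulled toward the clip average with the two-pass VBR bias as
the exponent, then discounted by the active area raised to ACT_AREA_CORRECTION. The
standalone sketch below mirrors that formula with invented inputs (the error values,
the 50% bias and the 0.75 active area are assumptions, not values from the patch):

/* Illustrative rework of the modified-error formula; all inputs are assumed. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double av_err = 4000.0;      /* clip-average weighted coded error (assumed) */
  const double frame_err = 8000.0;   /* this frame's coded_error * weight (assumed) */
  const double vbr_bias = 50.0;      /* oxcf->two_pass_vbrbias, in percent (assumed) */
  const double active_area = 0.75;   /* as returned by calculate_active_area() */

  double modified = av_err * pow(frame_err / av_err, vbr_bias / 100.0);
  modified *= pow(active_area, 0.5); /* ACT_AREA_CORRECTION */

  printf("modified error ~= %.1f\n", modified); /* ~4899 before the min/max clamp */
  return 0;
}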
@@ -256,7 +290,7 @@
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
     int i;
     for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
       output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
@@ -267,16 +301,16 @@
   }
 }
 
-static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_8X8:
-      return vp9_mse8x8;
+      return vpx_mse8x8;
     case BLOCK_16X8:
-      return vp9_mse16x8;
+      return vpx_mse16x8;
     case BLOCK_8X16:
-      return vp9_mse8x16;
+      return vpx_mse8x16;
     default:
-      return vp9_mse16x16;
+      return vpx_mse16x16;
   }
 }
 
@@ -284,16 +318,70 @@
                                          const struct buf_2d *src,
                                          const struct buf_2d *ref) {
   unsigned int sse;
-  const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
+  const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
   fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
   return sse;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  switch (bd) {
+    default:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vpx_highbd_8_mse8x8;
+        case BLOCK_16X8:
+          return vpx_highbd_8_mse16x8;
+        case BLOCK_8X16:
+          return vpx_highbd_8_mse8x16;
+        default:
+          return vpx_highbd_8_mse16x16;
+      }
+      break;
+    case 10:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vpx_highbd_10_mse8x8;
+        case BLOCK_16X8:
+          return vpx_highbd_10_mse16x8;
+        case BLOCK_8X16:
+          return vpx_highbd_10_mse8x16;
+        default:
+          return vpx_highbd_10_mse16x16;
+      }
+      break;
+    case 12:
+      switch (bsize) {
+        case BLOCK_8X8:
+          return vpx_highbd_12_mse8x8;
+        case BLOCK_16X8:
+          return vpx_highbd_12_mse16x8;
+        case BLOCK_8X16:
+          return vpx_highbd_12_mse8x16;
+        default:
+          return vpx_highbd_12_mse16x16;
+      }
+      break;
+  }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+  const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // Refine the motion search range according to the frame dimension
 // for first pass test.
-static int get_search_range(const VP9_COMMON *cm) {
+static int get_search_range(const VP9_COMP *cpi) {
   int sr = 0;
-  const int dim = MIN(cm->width, cm->height);
+  const int dim = MIN(cpi->initial_width, cpi->initial_height);
 
   while ((dim << sr) < MAX_FULL_PEL_VAL)
     ++sr;
@@ -309,16 +397,21 @@
   int num00, tmp_err, n;
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
-  const int new_mv_mode_penalty = 256;
+  const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
 
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  const int sr = get_search_range(&cpi->common);
+  const int sr = get_search_range(cpi);
   step_param += sr;
   further_steps -= sr;
 
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
@@ -370,11 +463,11 @@
   }
 }
 
-static int find_fp_qindex() {
+static int find_fp_qindex(vpx_bit_depth_t bit_depth) {
   int i;
 
   for (i = 0; i < QINDEX_RANGE; ++i)
-    if (vp9_convert_qindex_to_q(i) >= 30.0)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q)
       break;
 
   if (i == QINDEX_RANGE)
@@ -396,24 +489,20 @@
   cpi->rc.frames_to_key = INT_MAX;
 }
 
-void vp9_first_pass(VP9_COMP *cpi) {
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
   int i;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
-  int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height);
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
@@ -424,62 +513,87 @@
   int mvcount = 0;
   int intercount = 0;
   int second_ref_count = 0;
-  int intrapenalty = 256;
-  int neutral_count = 0;
+  const int intrapenalty = INTRA_MODE_PENALTY;
+  double neutral_count;
+  int intra_skip_count = 0;
+  int image_data_start_row = INVALID_ROW;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
-  uint32_t lastmv_as_int = 0;
+  MV lastmv = {0, 0};
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
+  int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
 
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
+  double intra_factor;
+  double brightness_factor;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs);
+    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
   }
 #endif
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
+
+  intra_factor = 0.0;
+  brightness_factor = 0.0;
+  neutral_count = 0.0;
 
   set_first_pass_params(cpi);
-  vp9_set_quantizer(cm, find_fp_qindex());
+  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
 
-  if (is_spatial_svc(cpi)) {
-    MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
-    const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
-    twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass;
+  if (lc != NULL) {
+    twopass = &lc->twopass;
 
-    if (cpi->common.current_video_frame == 0) {
-      cpi->ref_frame_flags = 0;
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
     } else {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
-      if (lc->current_video_frame_in_layer == 0)
-        cpi->ref_frame_flags = VP9_GOLD_FLAG;
-      else
-        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = 0;
     }
 
+    if (lc->current_video_frame_in_layer == 0)
+      cpi->ref_frame_flags = 0;
+
     vp9_scale_references(cpi);
 
     // Use either last frame or alt frame for motion search.
     if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
-      ref_frame = LAST_FRAME;
-    } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
-      ref_frame = GOLDEN_FRAME;
+      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (first_ref_buf == NULL)
+        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
     }
 
-    if (scaled_ref_buf != NULL)
-      first_ref_buf = scaled_ref_buf;
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      if (gld_yv12 == NULL) {
+        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      }
+    } else {
+      gld_yv12 = NULL;
+    }
 
-    recon_y_stride = new_yv12->y_stride;
-    recon_uv_stride = new_yv12->uv_stride;
-    uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
-
-    // Disable golden frame for svc first pass for now.
-    gld_yv12 = NULL;
-    set_ref_ptrs(cm, xd, ref_frame, NONE);
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
 
     cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
                                         &cpi->scaled_source);
@@ -488,9 +602,12 @@
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
 
+  if (!frame_is_intra_only(cm)) {
+    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+  }
+
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
@@ -510,10 +627,12 @@
   // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
-  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    int_mv best_ref_mv;
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
-    best_ref_mv.as_int = 0;
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    MV best_ref_mv = {0, 0};
 
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
@@ -529,13 +648,15 @@
     for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
       int this_error;
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+      double log_intra;
+      int level_sample;
+
 #if CONFIG_FP_MB_STATS
       const int mb_index = mb_row * cm->mb_cols + mb_col;
 #endif
 
-      vp9_clear_system_state();
+      vpx_clear_system_state();
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -548,24 +669,64 @@
                      mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
                      cm->mi_rows, cm->mi_cols);
 
-      if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        const int energy = vp9_block_energy(cpi, x, bsize);
-        error_weight = vp9_vaq_inv_q_ratio(energy);
-      }
-
       // Do intra 16x16 prediction.
       x->skip_encode = 0;
       xd->mi[0]->mbmi.mode = DC_PRED;
       xd->mi[0]->mbmi.tx_size = use_dc_pred ?
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
       vp9_encode_intra_block_plane(x, bsize, 0);
-      this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
 
-      if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        vp9_clear_system_state();
-        this_error = (int)(this_error * error_weight);
+      // Keep a record of blocks that have almost no intra error residual
+      // (i.e. are in effect completely flat and untextured in the intra
+      // domain). In natural videos this is uncommon, but it is much more
+      // common in animations, graphics and screen content, so may be used
+      // as a signal to detect these types of content.
+      if (this_error < UL_INTRA_THRESH) {
+        ++intra_skip_count;
+      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+        image_data_start_row = mb_row;
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            break;
+          case VPX_BITS_10:
+            this_error >>= 4;
+            break;
+          case VPX_BITS_12:
+            this_error >>= 8;
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                        "VPX_BITS_10 or VPX_BITS_12");
+            return;
+        }
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      vpx_clear_system_state();
+      log_intra = log(this_error + 1.0);
+      if (log_intra < 10.0)
+        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+      else
+        intra_factor += 1.0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+      else
+        level_sample = x->plane[0].src.buf[0];
+#else
+      level_sample = x->plane[0].src.buf[0];
+#endif
+      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+      else
+        brightness_factor += 1.0;
+
       // Intrapenalty below deals with situations where the intra and inter
       // error scores are very low (e.g. a plain black frame).
       // We do not have special cases in first pass for 0,0 and nearest etc so
@@ -591,16 +752,26 @@
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
       // Other than for the first frame do a motion search.
-      if (cm->current_video_frame > 0) {
+      if ((lc == NULL && cm->current_video_frame > 0) ||
+          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
         int tmp_err, motion_error, raw_motion_error;
-        int_mv mv, tmp_mv;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = {0, 0} , tmp_mv = {0, 0};
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-        motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                            &xd->plane[0].pre[0]);
-        // Assume 0,0 motion with no mv overhead.
-        mv.as_int = tmp_mv.as_int = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        }
+#else
+        motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -609,51 +780,60 @@
             cpi->unscaled_last_source->y_buffer + recon_yoffset;
         unscaled_last_source_buf_2d.stride =
             cpi->unscaled_last_source->y_stride;
-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                &unscaled_last_source_buf_2d);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+        }
+#else
+        raw_motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || is_spatial_svc(cpi)) {
+        if (raw_motion_error > 25 || lc != NULL) {
           // Test last reference frame using the previous best mv as the
           // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
-                                   &motion_error);
-          if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();
-            motion_error = (int)(motion_error * error_weight);
-          }
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
 
           // If the current best reference mv is not centered on 0,0 then do a
           // 0,0 based search as well.
-          if (best_ref_mv.as_int) {
+          if (!is_zero_mv(&best_ref_mv)) {
             tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
-            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-              vp9_clear_system_state();
-              tmp_err = (int)(tmp_err * error_weight);
-            }
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
 
             if (tmp_err < motion_error) {
               motion_error = tmp_err;
-              mv.as_int = tmp_mv.as_int;
+              mv = tmp_mv;
             }
           }
 
           // Search in an older reference frame.
-          if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+          if (((lc == NULL && cm->current_video_frame > 1) ||
+               (lc != NULL && lc->current_video_frame_in_layer > 1))
+              && gld_yv12 != NULL) {
             // Assume 0,0 motion with no mv overhead.
             int gf_motion_error;
 
             xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                   &xd->plane[0].pre[0]);
-
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
-                                     &gf_motion_error);
-            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-              vp9_clear_system_state();
-              gf_motion_error = (int)(gf_motion_error * error_weight);
+#if CONFIG_VP9_HIGHBITDEPTH
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
             }
+#else
+            gf_motion_error = get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+                                     &gf_motion_error);
 
             if (gf_motion_error < motion_error && gf_motion_error < this_error)
               ++second_ref_count;
@@ -679,7 +859,8 @@
         }
 
         // Start by assuming that intra mode is best.
-        best_ref_mv.as_int = 0;
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
 
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
@@ -696,32 +877,41 @@
 #endif
 
         if (motion_error <= this_error) {
+          vpx_clear_system_state();
+
           // Keep a count of cases where the inter and intra were very close
           // and very low. This helps with scene cut detection for example in
           // cropped clips with black bars at the sides or top and bottom.
           if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              this_error < 2 * intrapenalty)
-            ++neutral_count;
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+            neutral_count += (double)motion_error /
+                             DOUBLE_DIVIDE_CHECK((double)this_error);
+          }
 
-          mv.as_mv.row *= 8;
-          mv.as_mv.col *= 8;
+          mv.row *= 8;
+          mv.col *= 8;
           this_error = motion_error;
           xd->mi[0]->mbmi.mode = NEWMV;
-          xd->mi[0]->mbmi.mv[0] = mv;
+          xd->mi[0]->mbmi.mv[0].as_mv = mv;
           xd->mi[0]->mbmi.tx_size = TX_4X4;
           xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
           vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.as_mv.row;
-          sum_mvr_abs += abs(mv.as_mv.row);
-          sum_mvc += mv.as_mv.col;
-          sum_mvc_abs += abs(mv.as_mv.col);
-          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
-          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
           ++intercount;
 
-          best_ref_mv.as_int = mv.as_int;
+          best_ref_mv = mv;
 
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
@@ -739,7 +929,7 @@
           }
 #endif
 
-          if (mv.as_int) {
+          if (!is_zero_mv(&mv)) {
             ++mvcount;
 
 #if CONFIG_FP_MB_STATS
@@ -770,33 +960,33 @@
 #endif
 
             // Non-zero vector, was it different from the last non zero vector?
-            if (mv.as_int != lastmv_as_int)
+            if (!is_equal_mv(&mv, &lastmv))
               ++new_mv_count;
-            lastmv_as_int = mv.as_int;
+            lastmv = mv;
 
             // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 --sum_in_vectors;
             }
 
             // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 --sum_in_vectors;
             }
           }
@@ -822,33 +1012,61 @@
     x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
                            uv_mb_height * cm->mb_cols;
 
-    vp9_clear_system_state();
+    vpx_clear_system_state();
   }
 
-  vp9_clear_system_state();
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((image_data_start_row > cm->mb_rows / 2) ||
+      (image_data_start_row == INVALID_ROW)) {
+    image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (image_data_start_row > 0) {
+    intra_skip_count =
+      MAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+  }
+
   {
     FIRSTPASS_STATS fps;
+    // The minimum error here ensures some bit allocation to frames even
+    // in static regions. The allocation per MB declines for larger formats
+    // where the typical "real" energy per MB also falls.
+    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+    // number of mbs is proportional to the image area.
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const double min_err = 200 * sqrt(num_mbs);
+
+    intra_factor = intra_factor / (double)num_mbs;
+    brightness_factor = brightness_factor / (double)num_mbs;
+    fps.weight = intra_factor * brightness_factor;
 
     fps.frame = cm->current_video_frame;
     fps.spatial_layer_id = cpi->svc.spatial_layer_id;
-    fps.intra_error = (double)(intra_error >> 8);
-    fps.coded_error = (double)(coded_error >> 8);
-    fps.sr_coded_error = (double)(sr_coded_error >> 8);
+    fps.coded_error = (double)(coded_error >> 8) + min_err;
+    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+    fps.intra_error = (double)(intra_error >> 8) + min_err;
     fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = (double)neutral_count / cm->MBs;
+    fps.pcnt_inter = (double)intercount / num_mbs;
+    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+    fps.pcnt_neutral = (double)neutral_count / num_mbs;
+    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
       fps.mvr_abs = (double)sum_mvr_abs / mvcount;
       fps.MVc = (double)sum_mvc / mvcount;
       fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = (double)mvcount / cm->MBs;
+      fps.pcnt_motion = (double)mvcount / num_mbs;
     } else {
       fps.MVr = 0.0;
       fps.mvr_abs = 0.0;
@@ -864,7 +1082,7 @@
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
+    fps.duration = (double)(source->ts_end - source->ts_start);
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
@@ -886,26 +1104,30 @@
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
-      vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idx]);
     }
     twopass->sr_update_lag = 1;
   } else {
     ++twopass->sr_update_lag;
   }
 
-  vp9_extend_frame_borders(new_yv12);
+  vpx_extend_frame_borders(new_yv12);
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     vp9_update_reference_frames(cpi);
   } else {
-    // Swap frame pointers so last frame refers to the frame we just compressed.
-    swap_yv12(lst_yv12, new_yv12);
+    // The frame we just compressed now becomes the last frame.
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+               cm->new_fb_idx);
   }
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
-    vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
+      lc == NULL) {
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idx]);
   }
 
   // Use this to see what the first pass reconstruction looks like.
@@ -926,19 +1148,20 @@
 
   ++cm->current_video_frame;
   if (cpi->use_svc)
-    vp9_inc_frame_in_layer(&cpi->svc);
+    vp9_inc_frame_in_layer(cpi);
 }
 
 static double calc_correction_factor(double err_per_mb,
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int q) {
+                                     int q,
+                                     vpx_bit_depth_t bit_depth) {
   const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low,
-                                pt_high);
+  const double power_term =
+      MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
 
   // Calculate correction factor.
   if (power_term < 1.0)
@@ -947,35 +1170,53 @@
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
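
To make the revised correction factor concrete: with the multiplier lowered from
0.0125 to 0.01, the power term saturates at FACTOR_PT_HIGH once the real quantizer
reaches (0.90 - 0.70) / 0.01 = 20, and the result is clamped to [0.05, 5.0]. The
sketch below runs one invented data point through the same steps (the error, divisor
and quantizer values are assumptions; clamp_d is a local stand-in for the libvpx
fclamp() helper):

/* Illustrative data point for calc_correction_factor(); inputs are assumed. */
#include <math.h>
#include <stdio.h>

static double clamp_d(double x, double lo, double hi) {
  return x < lo ? lo : (x > hi ? hi : x);
}

int main(void) {
  const double err_per_mb = 500.0;   /* assumed section error per macroblock */
  const double err_divisor = 128.0;  /* ERR_DIVISOR, ignoring the size correction */
  const double q_real = 30.0;        /* assumed vp9_convert_qindex_to_q() output */
  const double pt_low = 0.70, pt_high = 0.90;  /* FACTOR_PT_LOW / FACTOR_PT_HIGH */

  const double error_term = err_per_mb / err_divisor;               /* ~3.91 */
  const double power_term = fmin(q_real * 0.01 + pt_low, pt_high);  /* 0.90  */

  printf("factor ~= %.2f\n", clamp_d(pow(error_term, power_term), 0.05, 5.0));
  return 0;                          /* prints ~3.41 */
}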
 
+// Larger image formats are expected to be a little harder to code relatively
+// given the same prediction error score. This in part at least relates to the
+// increased size and hence coding cost of motion vectors.
+#define EDIV_SIZE_FACTOR 800
+
 static int get_twopass_worst_quality(const VP9_COMP *cpi,
-                                     const FIRSTPASS_STATS *stats,
-                                     int section_target_bandwidth) {
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
   if (section_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
-    const int num_mbs = cpi->common.MBs;
-    const double section_err = stats->coded_error / stats->count;
-    const double err_per_mb = section_err / num_mbs;
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const int active_mbs = MAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
     const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
-                                         BPER_MB_NORMBITS) / num_mbs;
+                                         BPER_MB_NORMBITS) / active_mbs;
+
     int q;
     int is_svc_upper_layer = 0;
-    if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0)
+
+    if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
+
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
-          calc_correction_factor(err_per_mb, ERR_DIVISOR,
+          calc_correction_factor(av_err_per_mb,
+                                 ERR_DIVISOR - ediv_size_correction,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
-                                 FACTOR_PT_LOW, FACTOR_PT_HIGH, q);
-      const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
-                                                 factor * speed_term);
+                                 FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
+                                 cpi->common.bit_depth);
+      const int bits_per_mb =
+        vp9_rc_bits_per_mb(INTER_FRAME, q,
+                           factor * speed_term * group_weight_factor,
+                           cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
     }
@@ -987,14 +1228,45 @@
   }
 }
 
-extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+static void setup_rf_level_maxq(VP9_COMP *cpi) {
+  int i;
+  RATE_CONTROL *const rc = &cpi->rc;
+  for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+    int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality);
+    rc->rf_level_maxq[i] = MAX(rc->worst_quality + qdelta, rc->best_quality);
+  }
+}
+
+void vp9_init_subsampling(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int w = cm->width;
+  const int h = cm->height;
+  int i;
+
+  for (i = 0; i < FRAME_SCALE_STEPS; ++i) {
+    // Note: Frames with odd-sized dimensions may result from this scaling.
+    rc->frame_width[i] = (w * 16) / frame_scale_factor[i];
+    rc->frame_height[i] = (h * 16) / frame_scale_factor[i];
+  }
+
+  setup_rf_level_maxq(cpi);
+}
+
+void calculate_coded_size(VP9_COMP *cpi,
+                          int *scaled_frame_width,
+                          int *scaled_frame_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  *scaled_frame_width = rc->frame_width[rc->frame_size_selector];
+  *scaled_frame_height = rc->frame_height[rc->frame_size_selector];
+}
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
-                             (svc->number_temporal_layers == 1);
-  TWO_PASS *const twopass = is_spatial_svc ?
+  const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
+                              (svc->number_temporal_layers > 1);
+  TWO_PASS *const twopass = is_two_pass_svc ?
       &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
   double frame_rate;
   FIRSTPASS_STATS *stats;
@@ -1017,7 +1289,7 @@
   // It is calculated based on the actual durations of all frames from the
   // first pass.
 
-  if (is_spatial_svc) {
+  if (is_two_pass_svc) {
     vp9_update_spatial_layer_framerate(cpi, frame_rate);
     twopass->bits_left = (int64_t)(stats->duration *
         svc->layer_context[svc->spatial_layer_id].target_bandwidth /
@@ -1028,17 +1300,6 @@
                              10000000.0);
   }
 
-  // Calculate a minimum intra value to be used in determining the IIratio
-  // scores used in the second pass. We have this minimum to make sure
-  // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF.
-  if (!is_spatial_svc) {
-    // We don't know the number of MBs for each layer at this point.
-    // So we will do it later.
-    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
-  }
-
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
@@ -1054,80 +1315,119 @@
     twopass->modified_error_max = (avg_error *
                                       oxcf->two_pass_vbrmax_section) / 100;
     while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(twopass, oxcf, s);
+      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
       ++s;
     }
     twopass->modified_error_left = modified_error_total;
   }
 
-  // Reset the vbr bits off target counter
+  // Reset the vbr bits off target counters
   cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  if (oxcf->resize_mode != RESIZE_NONE) {
+    vp9_init_subsampling(cpi);
+  }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define SR_DIFF_MAX 128.0
+
+static double get_sr_decay_rate(const VP9_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                      ? cpi->initial_mbs : cpi->common.MBs;
+  double sr_diff =
+      (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double sr_decay = 1.0;
+  double modified_pct_inter;
+  double modified_pcnt_intra;
+  const double motion_amplitude_factor =
+    frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+  modified_pct_inter = frame->pcnt_inter;
+  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+      (double)NCOUNT_FRAME_II_THRESH) {
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+  }
+  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+
+  if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+    sr_diff = MIN(sr_diff, SR_DIFF_MAX);
+    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * modified_pcnt_intra);
+  }
+  return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
 }
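
A hedged numeric walk-through of the decay model above, with invented first-pass
stats (the figures below are assumptions chosen to keep the arithmetic simple; the
intra/inter error ratio is taken to be above NCOUNT_FRAME_II_THRESH so pcnt_neutral
is not subtracted, and the final floor of min(0.75, pcnt_inter) does not bind):

/* Illustrative data point for get_sr_decay_rate(); all inputs are assumed. */
#include <stdio.h>

int main(void) {
  const int num_mbs = 3600;               /* e.g. a 1280x720 frame (assumed) */
  const double coded_error = 400000.0;    /* assumed first-pass stats */
  const double sr_coded_error = 472000.0;
  const double pcnt_inter = 0.90, pcnt_motion = 0.30;
  const double mvr_abs = 6.0, mvc_abs = 8.0;

  const double sr_diff = (sr_coded_error - coded_error) / num_mbs;    /* 20.0 */
  const double motion_amp = pcnt_motion * ((mvc_abs + mvr_abs) / 2);  /*  2.1 */
  const double pcnt_intra = 100 * (1.0 - pcnt_inter);                 /* 10.0 */

  /* sr_diff exceeds LOW_SR_DIFF_TRHESH (0.1), so all three penalties apply. */
  const double sr_decay = 1.0 - 0.0015 * sr_diff      /* SR_DIFF_PART    */
                              - 0.003  * motion_amp   /* MOTION_AMP_PART */
                              - 0.005  * pcnt_intra;  /* INTRA_PART      */

  printf("sr_decay ~= %.3f\n", sr_decay);  /* ~0.914, above the 0.75 floor */
  return 0;
}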
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_prediction_decay_rate(const VP9_COMMON *cm,
-                                        const FIRSTPASS_STATS *next_frame) {
-  // Look at the observed drop in prediction quality between the last frame
-  // and the GF buffer (which contains an older frame).
-  const double mb_sr_err_diff = (next_frame->sr_coded_error -
-                                     next_frame->coded_error) / cm->MBs;
-  const double second_ref_decay = mb_sr_err_diff <= 512.0
-      ? fclamp(pow(1.0 - (mb_sr_err_diff / 512.0), 0.5), 0.85, 1.0)
-      : 0.85;
-
-  return MIN(second_ref_decay, next_frame->pcnt_inter);
-}
-
-// This function gives an estimate of how badly we believe the prediction
-// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
+static double get_zero_motion_factor(const VP9_COMP *cpi,
                                      const FIRSTPASS_STATS *frame) {
-  const double sr_ratio = frame->coded_error /
-                          DOUBLE_DIVIDE_CHECK(frame->sr_coded_error);
   const double zero_motion_pct = frame->pcnt_inter -
                                  frame->pcnt_motion;
-
-  return MIN(sr_ratio, zero_motion_pct);
+  double sr_decay = get_sr_decay_rate(cpi, frame);
+  return MIN(sr_decay, zero_motion_pct);
 }
 
+#define ZM_POWER_FACTOR 0.75
+
+static double get_prediction_decay_rate(const VP9_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double zero_motion_factor =
+    (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+                ZM_POWER_FACTOR));
+
+  return MAX(zero_motion_factor,
+             (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
 
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *twopass,
+static int detect_transition_to_still(VP9_COMP *cpi,
                                       int frame_interval, int still_interval,
                                       double loop_decay_rate,
                                       double last_decay_rate) {
-  int trans_to_still = 0;
+  TWO_PASS *const twopass = &cpi->twopass;
+  RATE_CONTROL *const rc = &cpi->rc;
 
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if (frame_interval > MIN_GF_INTERVAL &&
+  if (frame_interval > rc->min_gf_interval &&
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    const FIRSTPASS_STATS *position = twopass->stats_in;
-    FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
     for (j = 0; j < still_interval; ++j) {
-      if (EOF == input_stats(twopass, &tmp_next_frame))
+      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+      if (stats >= twopass->stats_in_end)
         break;
 
-      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
+      if (stats->pcnt_inter - stats->pcnt_motion < 0.999)
         break;
     }
 
-    reset_fpf_position(twopass, position);
-
     // Only if it does do we signal a transition to still.
-    if (j == still_interval)
-      trans_to_still = 1;
+    return j == still_interval;
   }
 
-  return trans_to_still;
+  return 0;
 }
 
 // This function detects a flash through the high relative pcnt_second_ref
@@ -1174,19 +1474,26 @@
   }
 }
 
-// Calculate a baseline boost number for the current frame.
-static double calc_frame_boost(const TWO_PASS *twopass,
+#define BASELINE_ERR_PER_MB 1000.0
+static double calc_frame_boost(VP9_COMP *cpi,
                                const FIRSTPASS_STATS *this_frame,
-                               double this_frame_mv_in_out) {
+                               double this_frame_mv_in_out,
+                               double max_boost) {
   double frame_boost;
+  const double lq =
+    vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+                            cpi->common.bit_depth);
+  const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                ? cpi->initial_mbs : cpi->common.MBs;
 
-  // Underlying boost factor is based on inter intra error ratio.
-  if (this_frame->intra_error > twopass->gf_intra_err_min)
-    frame_boost = (IIFACTOR * this_frame->intra_error /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-  else
-    frame_boost = (IIFACTOR * twopass->gf_intra_err_min /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+  // Correct for any inactive region in the image
+  num_mbs = (int)MAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
 
   // Increase boost for frames where new data coming into frame (e.g. zoom out).
   // Slightly reduce boost if there is a net balance of motion out of the frame
@@ -1197,7 +1504,7 @@
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-  return MIN(frame_boost, GF_RMAX);
+  return MIN(frame_boost, max_boost * boost_q_correction);
 }
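
And a similar invented data point for the reworked frame boost above (all figures
are assumptions; the negative mv_in_out branch visible in the hunk is the one
exercised, and max_boost stands in for GF_MAX_BOOST):

/* Illustrative data point for calc_frame_boost(); inputs are assumed. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double lq = 24.0;             /* assumed real q of avg inter-frame qindex */
  const double coded_error = 600000;  /* assumed first-pass coded error */
  const int num_mbs = 3600;           /* assumed active macroblock count */
  const double mv_in_out = -0.2;      /* net motion out of the frame (assumed) */
  const double max_boost = 96.0;      /* GF_MAX_BOOST */

  const double q_corr = fmin(0.5 + lq * 0.015, 1.5);   /* 0.86 */
  double boost = (1000.0 * num_mbs) / coded_error;     /* BASELINE_ERR_PER_MB term */
  boost *= 12.5 * q_corr;                              /* BOOST_FACTOR */
  boost += boost * (mv_in_out / 2.0);  /* slight reduction: motion leaving frame */

  printf("boost ~= %.1f (cap %.1f)\n", boost, max_boost * q_corr);
  return 0;                            /* ~58.1, below the ~82.6 cap */
}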
 
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1233,13 +1540,14 @@
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator * calc_frame_boost(cpi, this_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
   }
 
   *f_boost = (int)boost_score;
@@ -1271,19 +1579,21 @@
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator * calc_frame_boost(cpi, this_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
   }
   *b_boost = (int)boost_score;
 
   arf_boost = (*f_boost + *b_boost);
   if (arf_boost < ((b_frames + f_frames) * 20))
     arf_boost = ((b_frames + f_frames) * 20);
+  arf_boost = MAX(arf_boost, MIN_ARF_GF_BOOST);
 
   return arf_boost;
 }
@@ -1373,7 +1683,8 @@
                                    double group_error, int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
   int frame_index = 1;
@@ -1386,6 +1697,13 @@
   int mid_boost_bits = 0;
   int mid_frame_idx;
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+  int alt_frame_index = frame_index;
+  int has_temporal_layers = is_two_pass_svc(cpi) &&
+                            cpi->svc.number_temporal_layers > 1;
+
+  // Only encode alt reference frame in temporal base layer.
+  if (has_temporal_layers)
+    alt_frame_index = cpi->svc.number_temporal_layers;
 
   key_frame = cpi->common.frame_type == KEY_FRAME ||
               vp9_is_upper_layer_key_frame(cpi);
@@ -1396,17 +1714,17 @@
   // is also the golden frame.
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
-      twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
-      twopass->gf_group.rf_level[0] = INTER_NORMAL;
-      twopass->gf_group.bit_allocation[0] = 0;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = OVERLAY_UPDATE;
+      gf_group->rf_level[0] = INTER_NORMAL;
+      gf_group->bit_allocation[0] = 0;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     } else {
-      twopass->gf_group.update_type[0] = GF_UPDATE;
-      twopass->gf_group.rf_level[0] = GF_ARF_STD;
-      twopass->gf_group.bit_allocation[0] = gf_arf_bits;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = GF_UPDATE;
+      gf_group->rf_level[0] = GF_ARF_STD;
+      gf_group->bit_allocation[0] = gf_arf_bits;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     }
 
     // Step over the golden frame / overlay frame
@@ -1421,25 +1739,33 @@
 
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
-    twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
-    twopass->gf_group.arf_src_offset[frame_index] =
-      (unsigned char)(rc->baseline_gf_interval - 1);
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-    twopass->gf_group.arf_ref_idx[frame_index] =
+    gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+    gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+    gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+
+    if (has_temporal_layers)
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval -
+                          cpi->svc.number_temporal_layers);
+    else
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval - 1);
+
+    gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[alt_frame_index] =
       arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                          rc->source_alt_ref_active];
-    ++frame_index;
+    if (!has_temporal_layers)
+      ++frame_index;
 
     if (cpi->multi_arf_enabled) {
       // Set aside a slot for a level 1 arf.
-      twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-      twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
-      twopass->gf_group.arf_src_offset[frame_index] =
+      gf_group->update_type[frame_index] = ARF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->arf_src_offset[frame_index] =
         (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
-      twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
-      twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
       ++frame_index;
     }
   }
@@ -1448,12 +1774,16 @@
   mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
 
   // Allocate bits to the other frames in the group.
-  for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+  for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
     int arf_idx = 0;
     if (EOF == input_stats(twopass, &frame_stats))
       break;
 
-    modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
+    if (has_temporal_layers && frame_index == alt_frame_index) {
+      ++frame_index;
+    }
+
+    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
 
     if (group_error > 0)
       err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
@@ -1469,16 +1799,16 @@
       if (frame_index <= mid_frame_idx)
         arf_idx = 1;
     }
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
-    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 
     target_frame_size = clamp(target_frame_size, 0,
                               MIN(max_bits, (int)total_group_bits));
 
-    twopass->gf_group.update_type[frame_index] = LF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = LF_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
-    twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+    gf_group->bit_allocation[frame_index] = target_frame_size;
     ++frame_index;
   }
 
@@ -1486,23 +1816,23 @@
   // We need to configure the frame at the end of the sequence + 1 that will be
   // the start frame for the next group. Otherwise prior to the call to
   // vp9_rc_get_second_pass_params() the data will be undefined.
-  twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-  twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
     // Final setup for second arf and its overlay.
     if (cpi->multi_arf_enabled) {
-      twopass->gf_group.bit_allocation[2] =
-        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
-      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
-      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+      gf_group->bit_allocation[2] =
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
     }
   } else {
-    twopass->gf_group.update_type[frame_index] = GF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
 
   // Note whether multi-arf was enabled this group for next time.
@@ -1511,8 +1841,9 @@
 
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
   FIRSTPASS_STATS next_frame;
   const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
@@ -1521,6 +1852,11 @@
   double boost_score = 0.0;
   double old_boost_score = 0.0;
   double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+  double gf_group_raw_error = 0.0;
+#endif
+  double gf_group_skip_pct = 0.0;
+  double gf_group_inactive_zone_rows = 0.0;
   double gf_first_frame_err = 0.0;
   double mod_frame_err = 0.0;
 
@@ -1541,23 +1877,24 @@
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
+  int active_min_gf_interval;
   int64_t gf_group_bits;
   double gf_group_error_left;
   int gf_arf_bits;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
-  if (cpi->common.frame_type != KEY_FRAME) {
+  if (is_key_frame == 0) {
     vp9_zero(twopass->gf_group);
   }
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
   vp9_zero(next_frame);
 
-  gf_group_bits = 0;
-
   // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
+  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
   // Note the error of the frame at the start of the group. This will be
   // the GF frame error if we code a normal gf.
@@ -1565,26 +1902,48 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
-  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+  if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error -= this_frame->coded_error;
+#endif
+    gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+  }
 
   // Motion breakout threshold for loop below depends on image size.
-  mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 10.0;
+  mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
 
-  // Work out a maximum interval for the GF group.
+  // Set a maximum and minimum interval for the GF group.
   // If the image appears almost completely static we can extend beyond this.
-  if (cpi->multi_arf_allowed) {
-    active_max_gf_interval = rc->max_gf_interval;
-  } else {
-   // The value chosen depends on the active Q range. At low Q we have
-   // bits to spare and are better with a smaller interval and smaller boost.
-   // At high Q when there are few bits to spare we are better with a longer
-   // interval to spread the cost of the GF.
-   active_max_gf_interval =
-     12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
+  {
+    int int_max_q =
+      (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
+                                   cpi->common.bit_depth));
+    int int_lbq =
+      (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
+                                   cpi->common.bit_depth));
+    active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
+    if (active_min_gf_interval > rc->max_gf_interval)
+      active_min_gf_interval = rc->max_gf_interval;
 
-   if (active_max_gf_interval > rc->max_gf_interval)
-     active_max_gf_interval = rc->max_gf_interval;
+    if (cpi->multi_arf_allowed) {
+      active_max_gf_interval = rc->max_gf_interval;
+    } else {
+      // The value chosen depends on the active Q range. At low Q we have
+      // bits to spare and are better with a smaller interval and smaller boost.
+      // At high Q when there are few bits to spare we are better with a longer
+      // interval to spread the cost of the GF.
+      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
+      if (active_max_gf_interval < active_min_gf_interval)
+        active_max_gf_interval = active_min_gf_interval;
+
+      if (active_max_gf_interval > rc->max_gf_interval)
+        active_max_gf_interval = rc->max_gf_interval;
+      if (active_max_gf_interval < active_min_gf_interval)
+        active_max_gf_interval = active_min_gf_interval;
+    }
   }
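/* Illustrative sketch of the interval bounds above, assuming
 * min_gf_interval = 4 and max_gf_interval = 16 (made-up values):
 *   int_max_q = 320  ->  active_min_gf_interval = 4 + MIN(2, 320 / 200) = 5
 *   int_lbq   = 48   ->  active_max_gf_interval = 12 + MIN(4, 48 / 6)   = 16
 * Both values are then clamped so that
 * active_min_gf_interval <= active_max_gf_interval <= rc->max_gf_interval. */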
 
   i = 0;
@@ -1592,8 +1951,13 @@
     ++i;
 
     // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
+    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error += this_frame->coded_error;
+#endif
+    gf_group_skip_pct += this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
 
     if (EOF == input_stats(twopass, &next_frame))
       break;
@@ -1611,17 +1975,17 @@
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
       zero_motion_accumulator =
-        MIN(zero_motion_accumulator,
-            get_zero_motion_factor(&cpi->common, &next_frame));
+        MIN(zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
-      if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
         allow_alt_ref = 0;
         break;
@@ -1629,66 +1993,43 @@
     }
 
     // Calculate a boost number for this frame.
-    boost_score += decay_accumulator * calc_frame_boost(twopass, &next_frame,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
 
     // Break out conditions.
     if (
       // Break at active_max_gf_interval unless almost totally static.
-      (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
+      (i >= (active_max_gf_interval + arf_active_or_kf) &&
+            zero_motion_accumulator < 0.995) ||
       (
         // Don't break out with a very short interval.
-        (i > MIN_GF_INTERVAL) &&
-        ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
+        (i >= active_min_gf_interval + arf_active_or_kf) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
          (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < IIFACTOR)))) {
+         ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
       boost_score = old_boost_score;
       break;
     }
 
     *this_frame = next_frame;
-
     old_boost_score = boost_score;
   }
 
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
-  // Don't allow a gf too near the next kf.
-  if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
-      ++i;
-
-      if (EOF == input_stats(twopass, this_frame))
-        break;
-
-      if (i < rc->frames_to_key) {
-        mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
-        gf_group_err += mod_frame_err;
-      }
-    }
-  }
-
-  // Set the interval until the next gf.
-  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
-    rc->baseline_gf_interval = i - 1;
-  else
-    rc->baseline_gf_interval = i;
-
-  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+  // Was the group length constrained by the requirement for a new KF?
+  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
   // Should we use the alternate reference frame.
   if (allow_alt_ref &&
-      (i < cpi->oxcf.lag_in_frames) &&
-      (i >= MIN_GF_INTERVAL) &&
-      // For real scene cuts (not forced kfs) don't allow arf very near kf.
-      (rc->next_key_frame_forced ||
-      (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) {
+    (i < cpi->oxcf.lag_in_frames) &&
+    (i >= rc->min_gf_interval)) {
     // Calculate the boost for alt ref.
     rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
-                                   &b_boost);
+      &b_boost);
     rc->source_alt_ref_pending = 1;
 
     // Test to see if multi arf is appropriate.
@@ -1696,28 +2037,80 @@
       (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
       (zero_motion_accumulator < 0.995)) ? 1 : 0;
   } else {
-    rc->gfu_boost = (int)boost_score;
+    rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
   }
 
+  // Set the interval until the next gf.
+  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+
+  // Only encode alt reference frame in temporal base layer. So
+  // baseline_gf_interval should be multiple of a temporal layer group
+  // (typically the frame distance between two base layer frames)
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
+    int j;
+    for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+#if GROUP_ADAPTIVE_MAXQ
+      gf_group_raw_error += this_frame->coded_error;
+#endif
+      gf_group_skip_pct += this_frame->intra_skip_pct;
+      gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+    }
+    rc->baseline_gf_interval = new_gf_interval;
+  }
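/* A hedged note on the rounding above: count is one less than a power of
 * two, so (x + count) & ~count rounds x up to the next multiple of the
 * temporal-layer group size (count + 1). For example, with 3 temporal
 * layers, count = (1 << 2) - 1 = 3 and a baseline_gf_interval of 10 becomes
 * (10 + 3) & ~3 = 12, landing the ARF on a temporal base-layer frame. */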
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
-  // Calculate the extra bits to be used for boosted frame(s)
-  {
-    int q = rc->last_q[INTER_FRAME];
-    int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+      (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err = gf_group_raw_error  / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+      gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+      ((gf_group_inactive_zone_rows * 2) /
+       (rc->baseline_gf_interval * (double)cm->mb_rows));
 
-    // Set max and minimum boost and hence minimum allocation.
-    boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
-
-    // Calculate the extra bits to be used for boosted frame(s)
-    gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                       boost, gf_group_bits);
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    if (rc->rate_error_estimate > 0) {
+      rc_factor = MAX(RC_FACTOR_MIN,
+                      (double)(100 - rc->rate_error_estimate) / 100.0);
+    } else {
+      rc_factor = MIN(RC_FACTOR_MAX,
+                      (double)(100 - rc->rate_error_estimate) / 100.0);
+    }
+    tmp_q =
+      get_twopass_worst_quality(cpi, group_av_err,
+                                (group_av_skip_pct + group_av_inactive_zone),
+                                vbr_group_bits_per_frame,
+                                twopass->kfgroup_inter_fraction * rc_factor);
+    twopass->active_worst_quality =
+      MAX(tmp_q, twopass->active_worst_quality >> 1);
   }
+#endif
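/* Hedged sketch of the drift correction above: rc_factor is simply
 * (100 - rate_error_estimate) / 100 with one-sided clamps, e.g.
 *   rate_error_estimate = +20  ->  rc_factor = 0.80, subject to the RC_FACTOR_MIN floor
 *   rate_error_estimate = -20  ->  rc_factor = 1.20, subject to the RC_FACTOR_MAX cap
 * It scales kfgroup_inter_fraction when re-estimating the group maxq, so
 * sustained local rate drift feeds back into the active worst quality. */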
+
+  // Calculate the extra bits to be used for boosted frame(s)
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                     rc->gfu_boost, gf_group_bits);
 
   // Adjust KF group bits and error remaining.
   twopass->kf_group_error_left -= (int64_t)gf_group_err;
@@ -1730,7 +2123,7 @@
   // also a key frame in which case it has already been accounted for.
   if (rc->source_alt_ref_pending) {
     gf_group_error_left = gf_group_err - mod_frame_err;
-  } else if (cpi->common.frame_type != KEY_FRAME) {
+  } else if (is_key_frame == 0) {
     gf_group_error_left = gf_group_err - gf_first_frame_err;
   } else {
     gf_group_error_left = gf_group_err;
@@ -1748,28 +2141,68 @@
         calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
                                       rc->baseline_gf_interval);
   }
+
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to starting GF groups at normal frame size.
+    cpi->rc.next_frame_size_selector = UNSCALED;
+  }
 }
 
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letterbox
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra/inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
 static int test_candidate_kf(TWO_PASS *twopass,
                              const FIRSTPASS_STATS *last_frame,
                              const FIRSTPASS_STATS *this_frame,
                              const FIRSTPASS_STATS *next_frame) {
   int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+    this_frame->pcnt_inter - this_frame->pcnt_neutral;
 
   // Does the frame satisfy the primary criteria of a key frame?
+  // See above for an explanation of the test criteria.
   // If so, then examine how well it predicts subsequent frames.
-  if ((this_frame->pcnt_second_ref < 0.10) &&
-      (next_frame->pcnt_second_ref < 0.10) &&
-      ((this_frame->pcnt_inter < 0.05) ||
-       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
+  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
         ((this_frame->intra_error /
-          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+          KF_II_ERR_THRESHOLD) &&
         ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
          (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
+          DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
          ((next_frame->intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
+          DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
     int i;
     const FIRSTPASS_STATS *start_pos = twopass->stats_in;
     FIRSTPASS_STATS local_next_frame = *next_frame;
@@ -1779,11 +2212,11 @@
 
     // Examine how well the key frame predicts subsequent frames.
     for (i = 0; i < 16; ++i) {
-      double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
-      if (next_iiratio > RMAX)
-        next_iiratio = RMAX;
+      if (next_iiratio > KF_II_MAX)
+        next_iiratio = KF_II_MAX;
 
       // Cumulative effect of decay in prediction quality.
       if (local_next_frame.pcnt_inter > 0.85)
@@ -1831,13 +2264,16 @@
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const FIRSTPASS_STATS first_frame = *this_frame;
   const FIRSTPASS_STATS *const start_position = twopass->stats_in;
   FIRSTPASS_STATS next_frame;
   FIRSTPASS_STATS last_frame;
   int kf_bits = 0;
+  int loop_decay_counter = 0;
   double decay_accumulator = 1.0;
+  double av_decay_accumulator = 0.0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0.0;
   double kf_mod_err = 0.0;
@@ -1849,7 +2285,7 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // Reset the GF group data structures.
-  vp9_zero(twopass->gf_group);
+  vp9_zero(*gf_group);
 
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -1867,30 +2303,30 @@
   twopass->kf_group_bits = 0;        // Total bits available to kf group
   twopass->kf_group_error_left = 0;  // Group modified error score.
 
-  kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame);
+  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
   // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end &&
          rc->frames_to_key < cpi->oxcf.key_freq) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
     // Load the next frame's stats.
     last_frame = *this_frame;
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key &&
-        lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
       double loop_decay_rate;
 
       // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+      if (test_candidate_kf(twopass, &last_frame, this_frame,
+                            twopass->stats_in))
         break;
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -1902,7 +2338,7 @@
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i,
+      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
                                      loop_decay_rate, decay_accumulator))
         break;
 
@@ -1932,11 +2368,11 @@
     // Reset to the start of the group.
     reset_fpf_position(twopass, start_position);
 
-    kf_group_err = 0;
+    kf_group_err = 0.0;
 
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame);
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
       input_stats(twopass, &tmp_frame);
     }
     rc->next_key_frame_forced = 1;
@@ -1947,10 +2383,22 @@
     rc->next_key_frame_forced = 0;
   }
 
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_frame_to_key = (rc->frames_to_key + count) & (~count);
+    int j;
+    for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    }
+    rc->frames_to_key = new_frame_to_key;
+  }
+
   // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
@@ -1978,43 +2426,38 @@
   // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
-  // Scan through the kf group collating various stats used to deteermine
+  // Scan through the kf group collating various stats used to determine
   // how many bits to spend on it.
   decay_accumulator = 1.0;
   boost_score = 0.0;
-  for (i = 0; i < rc->frames_to_key; ++i) {
+  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
     if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Monitor for static sections.
     zero_motion_accumulator =
       MIN(zero_motion_accumulator,
-          get_zero_motion_factor(&cpi->common, &next_frame));
+          get_zero_motion_factor(cpi, &next_frame));
 
-    // For the first few frames collect data to decide kf boost.
-    if (i <= (rc->max_gf_interval * 2)) {
-      double r;
-      if (next_frame.intra_error > twopass->kf_intra_err_min)
-        r = (IIKFACTOR2 * next_frame.intra_error /
-             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-      else
-        r = (IIKFACTOR2 * twopass->kf_intra_err_min /
-             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
-      if (r > RMAX)
-        r = RMAX;
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((i <= rc->max_gf_interval) ||
+        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+      const double frame_boost =
+        calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
 
       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
-        const double loop_decay_rate = get_prediction_decay_rate(&cpi->common,
-                                                                 &next_frame);
+        const double loop_decay_rate =
+          get_prediction_decay_rate(cpi, &next_frame);
         decay_accumulator *= loop_decay_rate;
         decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
+        av_decay_accumulator += decay_accumulator;
+        ++loop_decay_counter;
       }
-
-      boost_score += (decay_accumulator * r);
+      boost_score += (decay_accumulator * frame_boost);
     }
   }
+  av_decay_accumulator /= (double)loop_decay_counter;
 
   reset_fpf_position(twopass, start_position);
 
@@ -2026,23 +2469,31 @@
       calculate_section_intra_ratio(start_position, twopass->stats_in_end,
                                     rc->frames_to_key);
 
+  // Apply various clamps for min and max boost
+  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+  rc->kf_boost = MAX(rc->kf_boost, (rc->frames_to_key * 3));
+  rc->kf_boost = MAX(rc->kf_boost, MIN_KF_BOOST);
+
   // Work out how many bits to allocate for the key frame itself.
-  rc->kf_boost = (int)boost_score;
-
-  if (rc->kf_boost  < (rc->frames_to_key * 3))
-    rc->kf_boost  = (rc->frames_to_key * 3);
-  if (rc->kf_boost   < MIN_KF_BOOST)
-    rc->kf_boost = MIN_KF_BOOST;
-
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
                                   rc->kf_boost, twopass->kf_group_bits);
 
+  // Work out the fraction of the kf group bits reserved for the inter frames
+  // within the group after discounting the bits for the kf itself.
+  if (twopass->kf_group_bits) {
+    twopass->kfgroup_inter_fraction =
+      (double)(twopass->kf_group_bits - kf_bits) /
+      (double)twopass->kf_group_bits;
+  } else {
+    twopass->kfgroup_inter_fraction = 1.0;
+  }
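/* Worked example of the split above (illustrative numbers): with
 * kf_group_bits = 1,000,000 and kf_bits = 200,000 the key frame takes 20% of
 * the group budget and kfgroup_inter_fraction = 800,000 / 1,000,000 = 0.8,
 * the share later used to weight the group maxq estimate in define_gf_group(). */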
+
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
-  twopass->gf_group.bit_allocation[0] = kf_bits;
-  twopass->gf_group.update_type[0] = KF_UPDATE;
-  twopass->gf_group.rf_level[0] = KF_STD;
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+  gf_group->rf_level[0] = KF_STD;
 
   // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2051,27 +2502,15 @@
   // The count of bits left is adjusted elsewhere based on real coded frame
   // sizes.
   twopass->modified_error_left -= kf_group_err;
-}
 
-// For VBR...adjustment to the frame target based on error from previous frames
-void vbr_rate_correction(int * this_frame_target,
-                         const int64_t vbr_bits_off_target) {
-  int max_delta = (*this_frame_target * 15) / 100;
-
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target +=
-      (vbr_bits_off_target > max_delta) ? max_delta
-                                        : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -=
-      (vbr_bits_off_target < -max_delta) ? max_delta
-                                         : (int)-vbr_bits_off_target;
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to normal-sized frame on keyframes.
+    cpi->rc.next_frame_size_selector = UNSCALED;
   }
 }
 
 // Define the reference buffers that will be updated post encode.
-void configure_buffer_updates(VP9_COMP *cpi) {
+static void configure_buffer_updates(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
 
   cpi->rc.is_src_frame_alt_ref = 0;
@@ -2106,7 +2545,11 @@
       assert(0);
       break;
   }
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
+    if (cpi->svc.temporal_layer_id > 0) {
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+    }
     if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
       cpi->refresh_golden_frame = 0;
     if (cpi->alt_ref_source == NULL)
@@ -2114,20 +2557,38 @@
   }
 }
 
+static int is_skippable_frame(const VP9_COMP *cpi) {
+  // If the current frame does not have a non-zero motion vector detected in
+  // the first pass, and neither do its previous and forward frames, then this
+  // frame can be skipped for the partition check, and the partition size is
+  // assigned according to the variance.
+  const SVC *const svc = &cpi->svc;
+  const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+
+  return (!frame_is_intra_only(&cpi->common) &&
+    twopass->stats_in - 2 > twopass->stats_in_start &&
+    twopass->stats_in < twopass->stats_in_end &&
+    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+    == 1 &&
+    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+    == 1 &&
+    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
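/* A hedged note on the skip test above: pcnt_inter - pcnt_motion == 1 holds
 * only when every block in the frame was inter coded (pcnt_inter == 1) and
 * none used a non-zero motion vector (pcnt_motion == 0). The frame is
 * treated as skippable for the partition search only when this is true for
 * the previous, current and next first-pass stats entries. */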
 
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   int frames_left;
   FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS this_frame_copy;
 
   int target_rate;
-  LAYER_CONTEXT *lc = NULL;
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (is_spatial_svc(cpi)) {
-    lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+  if (lc != NULL) {
     frames_left = (int)(twopass->total_stats.count -
                   lc->current_video_frame_in_layer);
   } else {
@@ -2140,21 +2601,16 @@
 
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
     int target_rate;
     configure_buffer_updates(cpi);
-    target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+    target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
-    // Correction to rate target based on prior over or under shoot.
-    if (cpi->oxcf.rc_mode == VPX_VBR)
-      vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
-
-    vp9_rc_set_frame_target(cpi, target_rate);
     cm->frame_type = INTER_FRAME;
 
-    if (is_spatial_svc(cpi)) {
+    if (lc != NULL) {
       if (cpi->svc.spatial_layer_id == 0) {
         lc->is_key_frame = 0;
       } else {
@@ -2165,84 +2621,121 @@
       }
     }
 
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip &&
+        cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
     return;
   }
 
-  vp9_clear_system_state();
-
-  if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) {
-    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
-  }
+  vpx_clear_system_state();
 
   if (cpi->oxcf.rc_mode == VPX_Q) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
-             (is_spatial_svc(cpi) &&
-              lc->current_video_frame_in_layer == 0)) {
+             (lc != NULL && lc->current_video_frame_in_layer == 0)) {
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
-    const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats,
-                                                section_target_bandwidth);
+    const double section_length = twopass->total_left_stats.count;
+    const double section_error =
+      twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+      twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+      (twopass->total_left_stats.inactive_zone_rows * 2) /
+      ((double)cm->mb_rows * section_length);
+    const int tmp_q =
+      get_twopass_worst_quality(cpi, section_error,
+                                section_intra_skip + section_inactive_zone,
+                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
     twopass->active_worst_quality = tmp_q;
+    twopass->baseline_active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
-    rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
     return;
 
-  // Local copy of the current frame's first pass stats.
-  this_frame_copy = this_frame;
+  // Set the frame content type flag.
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
 
   // Keyframe and section processing.
-  if (rc->frames_to_key == 0 ||
-      (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
     // Define next KF group and assign bits to it.
-    find_next_key_frame(cpi, &this_frame_copy);
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
   } else {
     cm->frame_type = INTER_FRAME;
   }
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     if (cpi->svc.spatial_layer_id == 0) {
       lc->is_key_frame = (cm->frame_type == KEY_FRAME);
-      if (lc->is_key_frame)
+      if (lc->is_key_frame) {
         cpi->ref_frame_flags &=
             (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+        lc->frames_from_key_frame = 0;
+        // Encode an intra only empty frame since we have a key frame.
+        cpi->svc.encode_intra_empty_frame = 1;
+      }
     } else {
       cm->frame_type = INTER_FRAME;
       lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
 
       if (lc->is_key_frame) {
         cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+        lc->frames_from_key_frame = 0;
       }
     }
   }
 
   // Define a new GF/ARF group. (Should always enter here for key frames).
   if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, &this_frame_copy);
-
-    if (twopass->gf_zeromotion_pct > 995) {
-      // As long as max_thresh for encode breakout is small enough, it is ok
-      // to enable it for show frame, i.e. set allow_encode_breakout to
-      // ENCODE_BREAKOUT_LIMITED.
-      if (!cm->show_frame)
-        cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED;
-      else
-        cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED;
-    }
+    define_gf_group(cpi, &this_frame);
 
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    if (!is_spatial_svc(cpi))
+    if (lc != NULL)
       cpi->refresh_golden_frame = 1;
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n",
+              cm->current_video_frame, rc->frames_till_gf_update_due,
+              rc->kf_boost, arf_count, rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
   }
 
   configure_buffer_updates(cpi);
 
-  target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  target_rate = gf_group->bit_allocation[gf_group->index];
   if (cpi->common.frame_type == KEY_FRAME)
     target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
   else
@@ -2250,36 +2743,114 @@
 
   rc->base_frame_target = target_rate;
 
-  // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == VPX_VBR)
-    vbr_rate_correction(&target_rate, rc->vbr_bits_off_target);
-
-  vp9_rc_set_frame_target(cpi, target_rate);
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy =
+      log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+  }
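/* Worked example of the energy term above (illustrative numbers): with a
 * stored intra_error of 4096 over num_mbs = 1024 macroblocks,
 *   mb_av_energy = log((4096 * 256.0 / 1024) + 1.0) = log(1025.0) ~= 6.93
 * where the * 256 undoes the >> 8 scaling noted in the comment. */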
 
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
+  const int bits_used = rc->base_frame_target;
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
   // sign of this value, a limited % adjustment is made to the target rate
   // of subsequent frames, to try and push it back towards 0. This method
   // is designed to prevent extreme behaviour at the end of a clip
   // or group of frames.
-  const int bits_used = rc->base_frame_target;
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
-
   twopass->bits_left = MAX(twopass->bits_left - bits_used, 0);
 
+  // Calculate the pct rc error.
+  if (rc->total_actual_bits) {
+    rc->rate_error_estimate =
+      (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  } else {
+    rc->rate_error_estimate = 0;
+  }
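/* Worked example of the drift estimate above (illustrative numbers): with
 * total_actual_bits = 1,000,000 and vbr_bits_off_target = +50,000,
 *   rate_error_estimate = (50,000 * 100) / 1,000,000 = 5
 * i.e. the encode is running roughly 5% under its target so far; the result
 * is clamped to [-100, 100]. */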
+
   if (cpi->common.frame_type != KEY_FRAME &&
       !vp9_is_upper_layer_key_frame(cpi)) {
     twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
   }
   twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
 
   // Increment the gf group index ready for the next frame.
   ++twopass->gf_group.index;
+
+  // If the rate control is drifting consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit =
+      rc->worst_quality - twopass->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+    // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        rc->vbr_bits_off_target_fast +=
+          fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+          MIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
+        if (rc->avg_frame_bandwidth) {
+          twopass->extend_minq_fast =
+            (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+        }
+        twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
+                                        minq_adj_limit - twopass->extend_minq);
+      } else if (rc->vbr_bits_off_target_fast) {
+        twopass->extend_minq_fast = MIN(twopass->extend_minq_fast,
+                                        minq_adj_limit - twopass->extend_minq);
+      } else {
+        twopass->extend_minq_fast = 0;
+      }
+    }
+  }
 }
diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h
index bf8c9fd..49f9da3 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/libvpx/vp9/encoder/vp9_firstpass.h
@@ -39,8 +39,11 @@
 } FIRSTPASS_MB_STATS;
 #endif
 
+#define VLOW_MOTION_THRESHOLD 950
+
 typedef struct {
   double frame;
+  double weight;
   double intra_error;
   double coded_error;
   double sr_coded_error;
@@ -48,6 +51,9 @@
   double pcnt_motion;
   double pcnt_second_ref;
   double pcnt_neutral;
+  double intra_skip_pct;
+  double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
   double MVr;
   double mvr_abs;
   double MVc;
@@ -70,6 +76,13 @@
   FRAME_UPDATE_TYPES = 5
 } FRAME_UPDATE_TYPE;
 
+#define FC_ANIMATION_THRESH 0.15
+typedef enum {
+  FC_NORMAL = 0,
+  FC_GRAPHICS_ANIMATION = 1,
+  FRAME_CONTENT_TYPES = 2
+} FRAME_CONTENT_TYPE;
+
 typedef struct {
   unsigned char index;
   RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
@@ -93,26 +106,35 @@
   double modified_error_min;
   double modified_error_max;
   double modified_error_left;
-  double kf_intra_err_min;
-  double gf_intra_err_min;
+  double mb_av_energy;
 
 #if CONFIG_FP_MB_STATS
   uint8_t *frame_mb_stats_buf;
   uint8_t *this_frame_mb_stats;
   FIRSTPASS_MB_STATS firstpass_mb_stats;
 #endif
+  // An indication of the content type of the current frame
+  FRAME_CONTENT_TYPE fr_content_type;
 
   // Projected total bits available for a key frame group of frames
   int64_t kf_group_bits;
 
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
+
+  // The fraction of a kf group's total bits allocated to the inter frames
+  double kfgroup_inter_fraction;
+
   int sr_update_lag;
 
   int kf_zeromotion_pct;
+  int last_kfgroup_zeromotion_pct;
   int gf_zeromotion_pct;
-
   int active_worst_quality;
+  int baseline_active_worst_quality;
+  int extend_minq;
+  int extend_maxq;
+  int extend_minq_fast;
 
   GF_GROUP gf_group;
 } TWO_PASS;
@@ -121,14 +143,22 @@
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
-void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 
 // Post encode update of the rate control parameters for 2-pass
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+
+void vp9_init_subsampling(struct VP9_COMP *cpi);
+
+void calculate_coded_size(struct VP9_COMP *cpi,
+                          int *scaled_frame_width,
+                          int *scaled_frame_height);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c
index e743517..8787be8 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/libvpx/vp9/encoder/vp9_lookahead.c
@@ -38,7 +38,7 @@
       unsigned int i;
 
       for (i = 0; i < ctx->max_sz; i++)
-        vp9_free_frame_buffer(&ctx->buf[i].img);
+        vpx_free_frame_buffer(&ctx->buf[i].img);
       free(ctx->buf);
     }
     free(ctx);
@@ -50,6 +50,9 @@
                                          unsigned int height,
                                          unsigned int subsampling_x,
                                          unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
                                          unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -62,15 +65,20 @@
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
+    const int legacy_byte_alignment = 0;
     unsigned int i;
     ctx->max_sz = depth;
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
     if (!ctx->buf)
       goto bail;
     for (i = 0; i < depth; i++)
-      if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
+      if (vpx_alloc_frame_buffer(&ctx->buf[i].img,
                                  width, height, subsampling_x, subsampling_y,
-                                 VP9_ENC_BORDER_IN_PIXELS))
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 legacy_byte_alignment))
         goto bail;
   }
   return ctx;
@@ -82,19 +90,40 @@
 #define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags) {
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags) {
   struct lookahead_entry *buf;
 #if USE_PARTIAL_COPY
   int row, col, active_end;
   int mb_rows = (src->y_height + 15) >> 4;
   int mb_cols = (src->y_width + 15) >> 4;
 #endif
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
 
   if (ctx->sz + 1  + MAX_PRE_FRAMES > ctx->max_sz)
     return 1;
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
 
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width ||
+                      height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
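/* A hedged reading of the assert above: larger_dimensions implies
 * new_dimensions, since a push can only require a bigger allocation when the
 * crop size actually changed. The code below reallocates the buffer only in
 * the larger case and otherwise just updates the crop/subsampling fields. */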
+
 #if USE_PARTIAL_COPY
   // TODO(jkoleszar): This is disabled for now, as
   // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
@@ -103,7 +132,7 @@
   // 1. Lookahead queue has has size of 1.
   // 2. Active map is provided.
   // 3. This is not a key frame, golden nor altref frame.
-  if (ctx->max_sz == 1 && active_map && !flags) {
+  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
     for (row = 0; row < mb_rows; ++row) {
       col = 0;
 
@@ -139,11 +168,32 @@
       active_map += mb_cols;
     }
   } else {
+#endif
+    if (larger_dimensions) {
+      YV12_BUFFER_CONFIG new_img;
+      memset(&new_img, 0, sizeof(new_img));
+      if (vpx_alloc_frame_buffer(&new_img,
+                                 width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 0))
+          return 1;
+      vpx_free_frame_buffer(&buf->img);
+      buf->img = new_img;
+    } else if (new_dimensions) {
+      buf->img.y_crop_width = src->y_crop_width;
+      buf->img.y_crop_height = src->y_crop_height;
+      buf->img.uv_crop_width = src->uv_crop_width;
+      buf->img.uv_crop_height = src->uv_crop_height;
+      buf->img.subsampling_x = src->subsampling_x;
+      buf->img.subsampling_y = src->subsampling_y;
+    }
+    // Partial copy not implemented yet
     vp9_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
   }
-#else
-  // Partial copy not implemented yet
-  vp9_copy_and_extend_frame(src, &buf->img);
 #endif
 
   buf->ts_start = ts_start;
diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h
index 678c51a..1382038 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/libvpx/vp9/encoder/vp9_lookahead.h
@@ -30,10 +30,6 @@
   int64_t             ts_start;
   int64_t             ts_end;
   unsigned int        flags;
-
-#if CONFIG_SPATIAL_SVC
-  vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS];
-#endif
 };
 
 // The max of past frames we want to keep in the queue.
@@ -56,6 +52,9 @@
                                          unsigned int height,
                                          unsigned int subsampling_x,
                                          unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
                                          unsigned int depth);
 
 
@@ -80,7 +79,11 @@
  * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags);
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags);
 
 
 /**\brief Get the next source buffer to encode
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 6e04e2a..d59f315 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -10,13 +10,16 @@
 
 #include <limits.h>
 
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_systemdependent.h"
 
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
@@ -24,7 +27,7 @@
                                               MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
   const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -34,6 +37,7 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   MV ref_full;
+  int cost_list[5];
 
   // Further step/diamond searches as necessary
   int step_param = mv_sf->reduce_first_step_size;
@@ -45,8 +49,9 @@
   ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
-                 ref_mv, dst_mv);
+  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+                 cond_cost_list(cpi, cost_list),
+                 &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -55,8 +60,10 @@
     unsigned int sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
-        &sse, NULL, 0, 0);
+        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        cond_cost_list(cpi, cost_list),
+        NULL, NULL,
+        &distortion, &sse, NULL, 0, 0);
   }
 
   xd->mi[0]->mbmi.mode = NEWMV;
@@ -70,20 +77,20 @@
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;
 
-  return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-          xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+  return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 }
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
                                   int_mv *dst_mv, int mb_row, int mb_col) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
   MV tmp_mv;
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
   dst_mv->as_int = 0;
 
@@ -113,13 +120,13 @@
 }
 
 static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err;
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
 
   dst_mv->as_int = 0;
@@ -127,7 +134,7 @@
   return err;
 }
 static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
-  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCK   *const x  = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   PREDICTION_MODE best_mode = -1, mode;
   unsigned int best_err = INT_MAX;
@@ -138,11 +145,11 @@
     unsigned int err;
 
     xd->mi[0]->mbmi.mode = mode;
-    vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
+    vp9_predict_intra_block(xd, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
-    err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+    err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                        xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 
     // find best
@@ -170,7 +177,7 @@
   int mb_row,
   int mb_col
 ) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int intra_error;
   VP9_COMMON *cm = &cpi->common;
@@ -225,7 +232,7 @@
                                        YV12_BUFFER_CONFIG *buf,
                                        YV12_BUFFER_CONFIG *golden_ref,
                                        YV12_BUFFER_CONFIG *alt_ref) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   VP9_COMMON *const cm = &cpi->common;
 
@@ -372,6 +379,8 @@
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
   YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
+  assert(golden_ref != NULL);
+
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
   if (n_frames <= cpi->rc.frames_till_gf_update_due)
@@ -383,9 +392,8 @@
   cpi->mbgraph_n_frames = n_frames;
   for (i = 0; i < n_frames; i++) {
     MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    vpx_memset(frame_stats->mb_stats, 0,
-               cm->mb_rows * cm->mb_cols *
-               sizeof(*cpi->mbgraph_stats[i].mb_stats));
+    memset(frame_stats->mb_stats, 0,
+           cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
   }
 
   // do motion search to find contribution of each reference to data
@@ -402,7 +410,7 @@
                                golden_ref, cpi->Source);
   }
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   separate_arf_mbs(cpi);
 }
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index ae924d5..aa3e51c 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -13,10 +13,13 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_reconinter.h"
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -90,13 +93,10 @@
 
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
                           int error_per_bit) {
-  if (x->nmvsadcost) {
-    const MV diff = { mv->row - ref->row,
-                      mv->col - ref->col };
-    return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
-                                      x->nmvsadcost) * error_per_bit, 8);
-  }
-  return 0;
+  const MV diff = { mv->row - ref->row,
+                    mv->col - ref->col };
+  return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
+                                    x->nmvsadcost) * error_per_bit, 8);
 }
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -162,9 +162,9 @@
       error_per_bit + 4096) >> 13 : 0)
 
 
-// convert motion vector component to offset for svf calc
+// convert motion vector component to offset for sv[a]f calc
 static INLINE int sp(int x) {
-  return (x & 7) << 1;
+  return x & 7;
 }
 
 static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
@@ -256,70 +256,340 @@
     }                                                   \
   }
 
-int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  int thismse;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST is a rewrite of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST                        \
+  {                                                     \
+    unsigned int second;                                \
+    int br0 = br;                                       \
+    int bc0 = bc;                                       \
+    assert(tr == br || tc == bc);                       \
+    if (tr == br && tc != bc) {                         \
+      kc = bc - tc;                                     \
+    } else if (tr != br && tc == bc) {                  \
+      kr = br - tr;                                     \
+    }                                                   \
+    CHECK_BETTER(second, br0 + kr, bc0);                \
+    CHECK_BETTER(second, br0, bc0 + kc);                \
+    if (br0 != br || bc0 != bc) {                       \
+      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+    }                                                   \
+  }
 
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
+#define SETUP_SUBPEL_SEARCH                                                \
+  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const int src_stride = x->plane[0].src.stride;                           \
+  const MACROBLOCKD *xd = &x->e_mbd;                                       \
+  unsigned int besterr = INT_MAX;                                          \
+  unsigned int sse;                                                        \
+  unsigned int whichdir;                                                   \
+  int thismse;                                                             \
+  const unsigned int halfiters = iters_per_step;                           \
+  const unsigned int quarteriters = iters_per_step;                        \
+  const unsigned int eighthiters = iters_per_step;                         \
+  const int y_stride = xd->plane[0].pre[0].stride;                         \
+  const int offset = bestmv->row * y_stride + bestmv->col;                 \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
+                                                                           \
+  int rr = ref_mv->row;                                                    \
+  int rc = ref_mv->col;                                                    \
+  int br = bestmv->row * 8;                                                \
+  int bc = bestmv->col * 8;                                                \
+  int hstep = 4;                                                           \
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);           \
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);           \
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);           \
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);           \
+  int tr = br;                                                             \
+  int tc = bc;                                                             \
+                                                                           \
+  bestmv->row *= 8;                                                        \
   bestmv->col *= 8;
 
-  // calculate central point error
-  // TODO(yunqingwang): central pointer error was already calculated in full-
-  // pixel search, and can be passed in this function.
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+                                       const MV *bestmv,
+                                       const MV *ref_mv,
+                                       int error_per_bit,
+                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y,
+                                       int y_stride,
+                                       const uint8_t *second_pred,
+                                       int w, int h, int offset,
+                                       int *mvjcost, int *mvcost[2],
+                                       unsigned int *sse1,
+                                       int *distortion) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
   if (second_pred != NULL) {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+                               y_stride);
+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
+                        sse1);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    }
   } else {
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
+    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
   }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 ( if diag selected)
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
+#else
+  (void) xd;
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+  } else {
+    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
   }
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return besterr;
+}
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+  return cost_list[0] < cost_list[1] &&
+         cost_list[0] < cost_list[2] &&
+         cost_list[0] < cost_list[3] &&
+         cost_list[0] < cost_list[4];
+}
+
+// Returns a surface minimum estimate at the given precision, i.e. in units
+// of 1/2^n pel where n = bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minimum (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic,
+                              int bits) {
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
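
For reference, a standalone sketch (not part of this patch) of the surface-minimum
estimate above, using made-up costs; div_round duplicates divide_and_round() so the
example is self-contained. With S0..S4 = 60, 120, 100, 80, 90 the continuous column
solution is x0 = (120 - 80) / (2 * (120 + 80 - 2*60)) = 0.25, so at bits = 2
(1/4-pel units) the integerized estimate is ic = 1:

#include <stdio.h>

/* copy of divide_and_round(): integer division rounded to nearest */
static int div_round(int n, int d) {
  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
}

int main(void) {
  const int cost[5] = { 60, 120, 100, 80, 90 };  /* S0..S4 (made-up sample) */
  const int bits = 2;                            /* 1/4-pel precision */
  const int ic = div_round((cost[1] - cost[3]) * (1 << (bits - 1)),
                           cost[1] - 2 * cost[0] + cost[3]);
  const int ir = div_round((cost[4] - cost[2]) * (1 << (bits - 1)),
                           cost[4] - 2 * cost[0] + cost[2]);
  printf("estimated minimum offset (row, col): (%d, %d)\n", ir, ic);  /* (0, 1) */
  return 0;
}
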
+
+int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion,
+    unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+
+    tr = br;
+    tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration (two if the diagonal was selected). 1/4 pel.
+    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+    if (forced_stop != 2) {
+      hstep >>= 1;
+      FIRST_LEVEL_CHECKS;
+      if (quarteriters > 1) {
+        SECOND_LEVEL_CHECKS;
+      }
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+                                             MV *bestmv, const MV *ref_mv,
+                                             int allow_hp,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,
+                                             int iters_per_step,
+                                             int *cost_list,
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected). 1/4 pel.
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *cost_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    unsigned int left, right, up, down, diag;
+    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+               (cost_list[2] < cost_list[4] ? 0 : 2);
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
   tr = br;
   tc = bc;
 
@@ -361,6 +631,163 @@
   return besterr;
 }
 
+static const MV search_step_table[12] = {
+    // left, right, up, down
+    {0, -4}, {0, 4}, {-4, 0}, {4, 0},
+    {0, -2}, {0, 2}, {-2, 0}, {2, 0},
+    {0, -1}, {0, 1}, {-1, 0}, {1, 0}
+};
+
+int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
+                                 MV *bestmv, const MV *ref_mv,
+                                 int allow_hp,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int forced_stop,
+                                 int iters_per_step,
+                                 int *cost_list,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  int thismse;
+  const int y_stride = xd->plane[0].pre[0].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter, round = 3 - forced_stop;
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+
+  if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+
+  (void) cost_list;  // to silence compiler warning
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        MV this_mv;
+        this_mv.row = tr;
+        this_mv.col = tc;
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+      MV this_mv = {tr, tc};
+      if (second_pred == NULL)
+        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                           src_address, src_stride, &sse);
+      else
+        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, &sse, second_pred);
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[idx] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1)
+      SECOND_LEVEL_CHECKS_BEST;
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected). 1/4 pel.
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
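To summarize the control flow above: each of the up-to-three rounds probes the four
axis neighbours at the current step size, picks one diagonal toward the two cheaper
axis directions, then halves the step (1/2 -> 1/4 -> 1/8 pel). A minimal standalone
sketch of that loop, assuming a hypothetical cost callback in place of the
svf()/svaf()-plus-mv_err_cost() evaluation:

/* Hypothetical cost callback; stands in for vfp->svf()/svaf() + mv_err_cost(). */
typedef unsigned int (*cost_fn)(int row, int col);

static void subpel_refine_sketch(int *br, int *bc, int rounds, cost_fn cost) {
  int hstep = 4;  /* 1/2 pel, in 1/8-pel units */
  unsigned int best = cost(*br, *bc);
  int iter;
  for (iter = 0; iter < rounds; ++iter) {
    const int dr[4] = { 0, 0, -hstep, hstep };   /* left, right, up, down */
    const int dc[4] = { -hstep, hstep, 0, 0 };
    unsigned int c[4];
    int i, kr, kc, best_idx = -1;
    for (i = 0; i < 4; ++i) {
      c[i] = cost(*br + dr[i], *bc + dc[i]);
      if (c[i] < best) { best = c[i]; best_idx = i; }
    }
    /* one diagonal, toward whichever horizontal/vertical probes were cheaper */
    kc = (c[0] <= c[1]) ? -hstep : hstep;
    kr = (c[2] <= c[3]) ? -hstep : hstep;
    {
      const unsigned int cd = cost(*br + kr, *bc + kc);
      if (cd < best) {
        best = cd;
        *br += kr;
        *bc += kc;
      } else if (best_idx >= 0) {
        *br += dr[best_idx];
        *bc += dc[best_idx];
      }
    }
    hstep >>= 1;  /* next round works at half the step */
  }
}
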
 #undef MVC
 #undef PRE
 #undef CHECK_BETTER
@@ -394,18 +821,72 @@
 #define MAX_PATTERN_CANDIDATES      8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
 
+// Calculate and return a sad+mvcost list around an integer best pel.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                      const MV *ref_mv,
+                                      int sadpb,
+                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv,
+                                      int *cost_list) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int br = best_mv->row;
+  int bc = best_mv->col;
+  MV this_mv;
+  int i;
+  unsigned int sse;
+
+  this_mv.row = br;
+  this_mv.col = bc;
+  cost_list[0] = fn_ptr->vf(what->buf, what->stride,
+                            get_buf_from_mv(in_what, &this_mv),
+                            in_what->stride, &sse) +
+      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+  if (check_bounds(x, br, bc, 1)) {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+                                    get_buf_from_mv(in_what, &this_mv),
+                                    in_what->stride, &sse) +
+          // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+          mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                      x->errorperbit);
+    }
+  } else {
+    for (i = 0; i < 4; i++) {
+      const MV this_mv = {br + neighbors[i].row,
+        bc + neighbors[i].col};
+      if (!is_mv_in(x, &this_mv))
+        cost_list[i + 1] = INT_MAX;
+      else
+        cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+                                      get_buf_from_mv(in_what, &this_mv),
+                                      in_what->stride, &sse) +
+            // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+            mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                        x->errorperbit);
+    }
+  }
+}
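
The five-entry cost_list filled above is ordered center, left, bottom, right, top.
A small sketch of how the pruned sub-pixel searches earlier in this file fold the two
axis comparisons into a quadrant index (this mirrors the whichdir computation in
vp9_find_best_sub_pixel_tree_pruned; the helper name is hypothetical):

/* 0: down-left, 1: down-right, 2: up-left, 3: up-right */
static int quadrant_from_cost_list(const int cost_list[5]) {
  return (cost_list[1] < cost_list[3] ? 0 : 1) +   /* left vs right  */
         (cost_list[2] < cost_list[4] ? 0 : 2);    /* bottom vs top  */
}
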
+
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
+//
 static int vp9_pattern_search(const MACROBLOCK *x,
                               MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
-                              int do_init_search, int do_refine,
+                              int do_init_search,
+                              int *cost_list,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
-                              const MV *center_mv, MV *best_mv,
+                              const MV *center_mv,
+                              MV *best_mv,
                               const int num_candidates[MAX_PATTERN_SCALES],
                               const MV candidates[MAX_PATTERN_SCALES]
                                                  [MAX_PATTERN_CANDIDATES]) {
@@ -413,7 +894,7 @@
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
-  int i, j, s, t;
+  int i, s, t;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   int br, bc;
@@ -552,26 +1033,86 @@
     } while (s--);
   }
 
-  // Check 4 1-away neighbors if do_refine is true.
-  // For most well-designed schemes do_refine will not be necessary.
-  if (do_refine) {
-    static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: cost at the best integer pel
+  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    const MV best_mv = { br, bc };
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+  return bestsad;
+}
 
-    for (j = 0; j < 16; j++) {
+// A specialized function where the smallest scale search candidates
+// are 4 1-away neighbors, and cost_list is non-null
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp9_pattern_search_sad(const MACROBLOCK *x,
+                                  MV *ref_mv,
+                                  int search_param,
+                                  int sad_per_bit,
+                                  int do_init_search,
+                                  int *cost_list,
+                                  const vp9_variance_fn_ptr_t *vfp,
+                                  int use_mvcost,
+                                  const MV *center_mv,
+                                  MV *best_mv,
+                                  const int num_candidates[MAX_PATTERN_SCALES],
+                                  const MV candidates[MAX_PATTERN_SCALES]
+                                                     [MAX_PATTERN_CANDIDATES]) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  };
+  int i, s, t;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  int br, bc;
+  int bestsad = INT_MAX;
+  int thissad;
+  int k = -1;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_init_s = search_param_to_steps[search_param];
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+  if (cost_list != NULL) {
+    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+        INT_MAX;
+  }
+
+  // Work out the start point for the search
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center
+  // point; pick the scale of the point that is best as the starting scale
+  // for further steps around it.
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;
+    for (t = 0; t <= s; ++t) {
       int best_site = -1;
-      if (check_bounds(x, br, bc, 1)) {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
+      if (check_bounds(x, br, bc, 1 << t)) {
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
                              in_what->stride);
           CHECK_BETTER
         }
       } else {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
           if (!is_mv_in(x, &this_mv))
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
@@ -580,19 +1121,220 @@
           CHECK_BETTER
         }
       }
-
       if (best_site == -1) {
-        break;
+        continue;
       } else {
-        br += neighbors[best_site].row;
-        bc += neighbors[best_site].col;
+        best_init_s = t;
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += candidates[best_init_s][k].row;
+      bc += candidates[best_init_s][k].col;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
+    int best_site = -1;
+    s = best_init_s;
+
+    for (; s >= do_sad; s--) {
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site == -1) {
+          continue;
+        } else {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+
+      do {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      } while (best_site != -1);
+    }
+
+    // Note: If we enter the if below, then cost_list must be non-NULL.
+    if (s == 0) {
+      cost_list[0] = bestsad;
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            cost_list[i + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            cost_list[i + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+      while (best_site != -1) {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = bestsad;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv)) {
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              continue;
+            }
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
       }
     }
   }
 
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: sad at the best integer pel
+  // cost_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+    if (cost_list[0] == INT_MAX) {
+      cost_list[0] = bestsad;
+      if (check_bounds(x, br, bc, 1)) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = { br + neighbors[i].row,
+                               bc + neighbors[i].col };
+          cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride);
+        }
+      } else {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (!is_mv_in(x, &this_mv))
+            cost_list[i + 1] = INT_MAX;
+          else
+            cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, &this_mv),
+                                       in_what->stride);
+        }
+      }
+    } else {
+      if (use_mvcost) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (cost_list[i + 1] != INT_MAX) {
+            cost_list[i + 1] +=
+                mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+          }
+        }
+      }
+    }
+  }
   best_mv->row = br;
   best_mv->col = bc;
-
   return bestsad;
 }
 
@@ -634,6 +1376,7 @@
                    int search_param,
                    int sad_per_bit,
                    int do_init_search,
+                   int *cost_list,
                    const vp9_variance_fn_ptr_t *vfp,
                    int use_mvcost,
                    const MV *center_mv, MV *best_mv) {
@@ -658,7 +1401,7 @@
       { -1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, cost_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             hex_num_candidates, hex_candidates);
 }
@@ -668,6 +1411,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *cost_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -698,10 +1442,10 @@
     {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
       {-512, 512}, {-1024, 0}},
   };
-  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
-                            center_mv, best_mv,
-                            bigdia_num_candidates, bigdia_candidates);
+  return vp9_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
+                                do_init_search, cost_list, vfp, use_mvcost,
+                                center_mv, best_mv,
+                                bigdia_num_candidates, bigdia_candidates);
 }
 
 int vp9_square_search(const MACROBLOCK *x,
@@ -709,6 +1453,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *cost_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -740,7 +1485,7 @@
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, cost_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             square_num_candidates, square_candidates);
 }
@@ -750,12 +1495,13 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,  // must be zero for fast_hex
+                        int *cost_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, vfp, use_mvcost,
+                        sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
                         center_mv, best_mv);
 }
 
@@ -764,13 +1510,14 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,
+                        int *cost_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                           sad_per_bit, do_init_search, vfp, use_mvcost,
-                           center_mv, best_mv);
+                           sad_per_bit, do_init_search, cost_list, vfp,
+                           use_mvcost, center_mv, best_mv);
 }
 
 #undef CHECK_BETTER
@@ -981,7 +1728,7 @@
           }
         }
         break;
-      };
+      }
 #endif
     } else if (best_address == in_what) {
       (*num00)++;
@@ -990,13 +1737,222 @@
   return bestsad;
 }
 
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = vp9_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 16) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 8) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - (bw >> 1));
+}
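
The function above is a coarse-to-fine 1-D search: a stride-16 sweep followed by
+/-8, +/-4, +/-2, +/-1 refinement around the running best. A generic standalone
sketch of the same idea, assuming a hypothetical score() callback in place of
vp9_vector_var():

#include <limits.h>
#include <stdint.h>

typedef int (*score_fn)(const int16_t *ref, const int16_t *src, int bwl);

static int coarse_to_fine_1d(const int16_t *ref, const int16_t *src,
                             int bwl, score_fn score) {
  const int bw = 4 << bwl;
  int best = INT_MAX, center = 0, step, d;
  for (d = 0; d <= bw; d += 16) {          /* coarse pass, stride 16 */
    const int s = score(&ref[d], src, bwl);
    if (s < best) { best = s; center = d; }
  }
  for (step = 8; step >= 1; step >>= 1) {  /* refine: +/-8, +/-4, +/-2, +/-1 */
    const int offset = center;
    for (d = -step; d <= step; d += 2 * step) {
      const int pos = offset + d;
      int s;
      if (pos < 0 || pos > bw) continue;   /* stay within the profile */
      s = score(&ref[pos], src, bwl);
      if (s < best) { best = s; center = pos; }
    }
  }
  return center - (bw >> 1);               /* displacement relative to center */
}
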
+
+static const MV search_pos[4] = {
+    {-1, 0}, {0, -1}, {0, 1}, {1, 0},
+};
+
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  MV this_mv;
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  {
+    unsigned int this_sad;
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return this_sad;
+  }
+#endif
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+  this_mv = *tmp_mv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  {
+    const uint8_t * const pos[4] = {
+        ref_buf - ref_stride,
+        ref_buf - 1,
+        ref_buf + 1,
+        ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                   ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
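
The estimator above matches 1-D projections of the reference and source blocks
rather than full 2-D SADs. A plain-C sketch of what such projections look like (the
real vp9_int_pro_row()/vp9_int_pro_col() are SIMD kernels and handle normalisation
differently, so treat this only as an illustration of the idea):

#include <stdint.h>

/* Project a width x height block onto a column profile (one sum per column)
 * and a row profile (one sum per row). Matching each profile against the
 * source block's profile yields independent column and row displacement
 * estimates, which the search above then refines. */
static void project_block(const uint8_t *buf, int stride,
                          int width, int height,
                          int16_t *col_profile, int16_t *row_profile) {
  int r, c;
  for (c = 0; c < width; ++c) {
    int sum = 0;
    for (r = 0; r < height; ++r)
      sum += buf[r * stride + c];
    col_profile[c] = (int16_t)sum;
  }
  for (r = 0; r < height; ++r) {
    int sum = 0;
    for (c = 0; c < width; ++c)
      sum += buf[r * stride + c];
    row_profile[r] = (int16_t)sum;
  }
}
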
+
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-
 int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
@@ -1040,7 +1996,7 @@
   if (do_refine) {
     const int search_range = 8;
     MV best_mv = *dst_mv;
-    thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
+    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, ref_mv);
     if (thissme < INT_MAX)
       thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
@@ -1049,6 +2005,11 @@
       *dst_mv = best_mv;
     }
   }
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
   return bestsme;
 }
 
@@ -1110,7 +2071,7 @@
     if (fn_ptr->sdx3f != NULL) {
       while ((c + 2) < col_max) {
         int i;
-        unsigned int sads[3];
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
 
         fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
                       sads);
@@ -1175,7 +2136,7 @@
     if (fn_ptr->sdx8f != NULL) {
       while ((c + 7) < col_max) {
         int i;
-        unsigned int sads[8];
+        DECLARE_ALIGNED(16, uint32_t, sads[8]);
 
         fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
                       sads);
@@ -1199,7 +2160,7 @@
     if (fn_ptr->sdx3f != NULL) {
       while ((c + 2) < col_max) {
         int i;
-        unsigned int sads[3];
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
 
         fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
                       sads);
@@ -1239,11 +2200,11 @@
   return best_sad;
 }
 
-int vp9_refining_search_sad_c(const MACROBLOCK *x,
-                              MV *ref_mv, int error_per_bit,
-                              int search_range,
-                              const vp9_variance_fn_ptr_t *fn_ptr,
-                              const MV *center_mv) {
+int vp9_refining_search_sad(const MACROBLOCK *x,
+                            MV *ref_mv, int error_per_bit,
+                            int search_range,
+                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   const struct buf_2d *const what = &x->plane[0].src;
@@ -1368,41 +2329,49 @@
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *cost_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+  if (cost_list) {
+    cost_list[0] = INT_MAX;
+    cost_list[1] = INT_MAX;
+    cost_list[2] = INT_MAX;
+    cost_list[3] = INT_MAX;
+    cost_list[4] = INT_MAX;
+  }
 
   switch (method) {
     case FAST_DIAMOND:
       var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case FAST_HEX:
       var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case HEX:
       var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           fn_ptr, 1, ref_mv, tmp_mv);
+                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case SQUARE:
       var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case BIGDIA:
       var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
       var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                    MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, fn_ptr, ref_mv, tmp_mv);
+                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
       break;
     default:
-      assert(!"Invalid search method.");
+      assert(0 && "Invalid search method.");
   }
 
   if (method != NSTEP && rd && var < var_max)
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 298fbb6..817bd79 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -13,7 +13,7 @@
 #define VP9_ENCODER_VP9_MCOMP_H_
 
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -66,19 +66,33 @@
 
 int vp9_init_search_range(int size);
 
-// Runs sequence of diamond searches in smaller steps for RD
+int vp9_refining_search_sad(const struct macroblock *x,
+                            struct mv *ref_mv,
+                            int sad_per_bit, int distance,
+                            const struct vp9_variance_vtable *fn_ptr,
+                            const struct mv *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
 int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
+// Perform integral projection-based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+                                           MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col);
+
 typedef int (integer_mv_pattern_search_fn) (
     const MACROBLOCK *x,
     MV *ref_mv,
     int search_param,
     int error_per_bit,
     int do_init_search,
+    int *cost_list,
     const vp9_variance_fn_ptr_t *vf,
     int use_mvcost,
     const MV *center_mv,
@@ -98,12 +112,16 @@
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
+    int *cost_list,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
@@ -136,8 +154,10 @@
 int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *cost_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index d365489..8e19103 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -14,6 +14,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -33,14 +34,29 @@
 }
 
 
-static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
-                            int filt_level, int partial_frame) {
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                VP9_COMP *const cpi,
+                                int filt_level, int partial_frame) {
   VP9_COMMON *const cm = &cpi->common;
-  int filt_err;
+  int64_t filt_err;
 
-  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
-                        partial_frame);
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame,
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+  else
+    vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
+  } else {
+    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
   filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -55,17 +71,18 @@
   const int min_filter_level = 0;
   const int max_filter_level = get_max_filter_level(cpi);
   int filt_direction = 0;
-  int best_err, filt_best;
+  int64_t best_err;
+  int filt_best;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
   int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
-  int ss_err[MAX_LOOP_FILTER + 1];
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
 
   // Set each entry to -1
-  vpx_memset(ss_err, 0xFF, sizeof(ss_err));
+  memset(ss_err, 0xFF, sizeof(ss_err));
 
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -77,10 +94,9 @@
   while (filter_step > 0) {
     const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
     const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
-    int filt_err;
 
     // Bias against raising loop filter in favor of lowering it.
-    int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
     if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
       bias = (bias * cpi->twopass.section_intra_rating) / 20;
@@ -92,17 +108,14 @@
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
-        ss_err[filt_low] = filt_err;
-      } else {
-        filt_err = ss_err[filt_low];
+        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
       }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
-      if ((filt_err - bias) < best_err) {
+      if ((ss_err[filt_low] - bias) < best_err) {
         // Was it actually better than the previous best?
-        if (filt_err < best_err)
-          best_err = filt_err;
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
 
         filt_best = filt_low;
       }
@@ -111,14 +124,11 @@
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
-        ss_err[filt_high] = filt_err;
-      } else {
-        filt_err = ss_err[filt_high];
+        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
       }
       // Was it better than the previous best?
-      if (filt_err < (best_err - bias)) {
-        best_err = filt_err;
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
         filt_best = filt_high;
       }
     }
@@ -149,10 +159,29 @@
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = get_max_filter_level(cpi);
-    const int q = vp9_ac_quant(cm->base_qindex, 0);
+    const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index 6115f5a..cc018fc 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -14,24 +14,37 @@
 #include <stdio.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
 
+#include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 
-static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                       const TileInfo *const tile,
-                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                       int_mv *mv_ref_list,
-                       int mi_row, int mi_col) {
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
+static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
+                      const MACROBLOCKD *xd,
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
 
@@ -42,7 +55,7 @@
   int const_motion = 0;
 
   // Blank the reference vector list
-  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
 
   // The nearest 2 blocks are treated differently
   // if the size < 8x8 we get the mv from the bmi substructure,
@@ -58,7 +71,8 @@
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
+                        refmv_count, mv_ref_list, Done);
     }
   }
 
@@ -75,7 +89,7 @@
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[0]);
+        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
     }
   }
 
@@ -90,14 +104,15 @@
                                               * xd->mi_stride]->mbmi;
 
         // If the candidate is INTRA we don't want to consider its mv.
-        IF_DIFF_REF_FRAME_ADD_MV(candidate);
+        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+                                 refmv_count, mv_ref_list, Done);
       }
     }
   }
 
  Done:
 
-  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+  x->mbmi_ext->mode_context[ref_frame] = counter_to_context[context_counter];
 
   // Clamp vectors
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
@@ -117,7 +132,7 @@
   const int sadpb = x->sadperbit16;
   MV mvp_full;
   const int ref = mbmi->ref_frame[0];
-  const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
+  const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
   int dis;
   int rate_mode;
   const int tmp_col_min = x->mv_col_min;
@@ -125,12 +140,9 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   int rv = 0;
+  int cost_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
-  if (cpi->common.show_frame &&
-      (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
-    return rv;
-
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
@@ -144,15 +156,16 @@
 
   assert(x->mv_best_ref_index[ref] <= 2);
   if (x->mv_best_ref_index[ref] < 2)
-    mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+    mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
   else
     mvp_full = x->pred_mv[ref];
 
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
-                        &tmp_mv->as_mv, INT_MAX, 0);
+  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                        cond_cost_list(cpi, cost_list),
+                        &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -166,7 +179,7 @@
   *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]]
+  rate_mode = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref]]
                                   [INTER_OFFSET(NEWMV)];
   rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) >
          best_rd_sofar);
@@ -178,9 +191,11 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
-    x->pred_mv[ref] = tmp_mv->as_mv;
+    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
 
   if (scaled_ref_frame) {
@@ -191,6 +206,248 @@
   return rv;
 }
 
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           int w, int h, unsigned int *sse, int *sum,
+                           int block_size, unsigned int *sse8x8,
+                           int *sum8x8, unsigned int *var8x8) {
+  int i, j, k = 0;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      vpx_get8x8var(src + src_stride * i + j, src_stride,
+                    ref + ref_stride * i + j, ref_stride,
+                    &sse8x8[k], &sum8x8[k]);
+      *sse += sse8x8[k];
+      *sum += sum8x8[k];
+      var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+      k++;
+    }
+  }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+                               unsigned int *sse_i, int *sum_i,
+                               unsigned int *var_o, unsigned int *sse_o,
+                               int *sum_o) {
+  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+  int i, j, k = 0;
+
+  for (i = 0; i < nh; i += 2) {
+    for (j = 0; j < nw; j += 2) {
+      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+          sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+          sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+      var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+          (b_width_log2_lookup[unit_size] +
+              b_height_log2_lookup[unit_size] + 6));
+      k++;
+    }
+  }
+}
+
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                                    MACROBLOCK *x, MACROBLOCKD *xd,
+                                    int *out_rate_sum, int64_t *out_dist_sum,
+                                    unsigned int *var_y, unsigned int *sse_y,
+                                    int mi_row, int mi_col, int *early_term) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const uint32_t dc_quant = pd->dequant[0];
+  const uint32_t ac_quant = pd->dequant[1];
+  const int64_t dc_thr = dc_quant * dc_quant >> 6;
+  const int64_t ac_thr = ac_quant * ac_quant >> 6;
+  unsigned int var;
+  int sum;
+  int skip_dc = 0;
+
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  const int num8x8 = 1 << (bw + bh - 2);
+  unsigned int sse8x8[64] = {0};
+  int sum8x8[64] = {0};
+  unsigned int var8x8[64] = {0};
+  TX_SIZE tx_size;
+  int i, k;
+
+  // Calculate variance for whole partition, and also save 8x8 blocks' variance
+  // to be used in following transform skipping test.
+  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+  var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));
+
+  *var_y = var;
+  *sse_y = sse;
+
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      tx_size = MIN(max_txsize_lookup[bsize],
+                    tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16)
+      tx_size = TX_16X16;
+  } else {
+    tx_size = MIN(max_txsize_lookup[bsize],
+                  tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
+  assert(tx_size >= TX_8X8);
+  xd->mi[0]->mbmi.tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    unsigned int sse16x16[16] = {0};
+    int sum16x16[16] = {0};
+    unsigned int var16x16[16] = {0};
+    const int num16x16 = num8x8 >> 2;
+
+    unsigned int sse32x32[4] = {0};
+    int sum32x32[4] = {0};
+    unsigned int var32x32[4] = {0};
+    const int num32x32 = num8x8 >> 4;
+
+    int ac_test = 1;
+    int dc_test = 1;
+    const int num = (tx_size == TX_8X8) ? num8x8 :
+        ((tx_size == TX_16X16) ? num16x16 : num32x32);
+    const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+        ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+    const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+        ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+    // Calculate variance if tx_size > TX_8X8
+    if (tx_size >= TX_16X16)
+      calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+                         sum16x16);
+    if (tx_size == TX_32X32)
+      calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+                         sse32x32, sum32x32);
+
+    // Skipping test
+    x->skip_txfm[0] = SKIP_TXFM_NONE;
+    for (k = 0; k < num; k++)
+      // Check if all ac coefficients can be quantized to zero.
+      if (!(var_tx[k] < ac_thr || var == 0)) {
+        ac_test = 0;
+        break;
+      }
+
+    for (k = 0; k < num; k++)
+      // Check if dc coefficient can be quantized to zero.
+      if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+        dc_test = 0;
+        break;
+      }
+
+    if (ac_test) {
+      x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+
+      if (dc_test)
+        x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+    } else if (dc_test) {
+      skip_dc = 1;
+    }
+  }
+
+  if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+    int skip_uv[2] = {0};
+    unsigned int var_uv[2];
+    unsigned int sse_uv[2];
+
+    *out_rate_sum = 0;
+    *out_dist_sum = sse << 4;
+
+    // Transform skipping test in UV planes.
+    for (i = 1; i <= 2; i++) {
+      struct macroblock_plane *const p = &x->plane[i];
+      struct macroblockd_plane *const pd = &xd->plane[i];
+      const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0]->mbmi, pd);
+      const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+      const int uv_bw = b_width_log2_lookup[uv_bsize];
+      const int uv_bh = b_height_log2_lookup[uv_bsize];
+      const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+          (uv_bh - b_height_log2_lookup[unit_size]);
+      const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+      const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+      int j = i - 1;
+
+      vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+          pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+      if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+          (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+        skip_uv[j] = 1;
+      else
+        break;
+    }
+
+    // If the transforms in the YUV planes are skippable, the mode search
+    // checks fewer inter modes and doesn't check intra modes.
+    if (skip_uv[0] & skip_uv[1]) {
+      *early_term = 1;
+    }
+
+    return;
+  }
+
+  if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+#else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  if (!skip_dc) {
+    *out_rate_sum = rate >> 1;
+    *out_dist_sum = dist << 3;
+  } else {
+    *out_rate_sum = 0;
+    *out_dist_sum = (sse - var) << 4;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3, &rate, &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
+}
 
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
@@ -204,42 +461,297 @@
   int64_t dist;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
+  const int64_t dc_thr = p->quant_thred[0] >> 6;
+  const int64_t ac_thr = p->quant_thred[1] >> 6;
   const uint32_t dc_quant = pd->dequant[0];
   const uint32_t ac_quant = pd->dequant[1];
   unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
                                            pd->dst.buf, pd->dst.stride, &sse);
+  int skip_dc = 0;
+
   *var_y = var;
   *sse_y = sse;
 
-  if (sse < dc_quant * dc_quant >> 6)
-    x->skip_txfm[0] = 1;
-  else if (var < ac_quant * ac_quant >> 6)
-    x->skip_txfm[0] = 2;
-  else
-    x->skip_txfm[0] = 0;
-
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
-      xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
-                          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+      xd->mi[0]->mbmi.tx_size =
+          MIN(max_txsize_lookup[bsize],
+              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       xd->mi[0]->mbmi.tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+      xd->mi[0]->mbmi.tx_size = TX_8X8;
+    else if (xd->mi[0]->mbmi.tx_size > TX_16X16)
+      xd->mi[0]->mbmi.tx_size = TX_16X16;
   } else {
-    xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
-                         tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    xd->mi[0]->mbmi.tx_size =
+        MIN(max_txsize_lookup[bsize],
+            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
-  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
-                               dc_quant >> 3, &rate, &dist);
-  *out_rate_sum = rate >> 1;
-  *out_dist_sum = dist << 3;
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    const BLOCK_SIZE unit_size =
+        txsize_to_bsize[xd->mi[0]->mbmi.tx_size];
+    const unsigned int num_blk_log2 =
+        (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
+        (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
+    const unsigned int sse_tx = sse >> num_blk_log2;
+    const unsigned int var_tx = var >> num_blk_log2;
 
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+    x->skip_txfm[0] = SKIP_TXFM_NONE;
+    // Check if all ac coefficients can be quantized to zero.
+    if (var_tx < ac_thr || var == 0) {
+      x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+      // Check if dc coefficient can be quantized to zero.
+      if (sse_tx - var_tx < dc_thr || sse == var)
+        x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+    } else {
+      if (sse_tx - var_tx < dc_thr || sse == var)
+        skip_dc = 1;
+    }
+  }
+
+  if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+    *out_rate_sum = 0;
+    *out_dist_sum = sse << 4;
+    return;
+  }
+
+  if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+#else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  if (!skip_dc) {
+    *out_rate_sum = rate >> 1;
+    *out_dist_sum = dist << 3;
+  } else {
+    *out_rate_sum = 0;
+    *out_dist_sum = (sse - var) << 4;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3, &rate, &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                ac_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum += rate;
   *out_dist_sum += dist << 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var_y, sse_y;
+  (void)plane;
+  (void)tx_size;
+  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  *sse = INT_MAX;
+  *skippable = 0;
+  return;
+}
+#else
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  const int step = 1 << (tx_size << 1);
+  const int block_step = (1 << tx_size);
+  int block = 0, r, c;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+  int eob_cost = 0;
+
+  (void)cpi;
+  vp9_subtract_plane(x, bsize, plane);
+  *skippable = 1;
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        const int16_t *src_diff;
+        src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+        switch (tx_size) {
+          case TX_32X32:
+            vpx_fdct32x32_rd(src_diff, coeff, diff_stride);
+            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                  p->round_fp, p->quant_fp, p->quant_shift,
+                                  qcoeff, dqcoeff, pd->dequant, eob,
+                                  scan_order->scan, scan_order->iscan);
+            break;
+          case TX_16X16:
+            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_8X8:
+            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_4X4:
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          default:
+            assert(0);
+            break;
+        }
+        *skippable &= (*eob == 0);
+        eob_cost += 1;
+      }
+      block += step;
+    }
+  }
+
+  if (*skippable && *sse < INT64_MAX) {
+    *rate = 0;
+    *dist = (*sse << 6) >> shift;
+    *sse = *dist;
+    return;
+  }
+
+  block = 0;
+  *rate = 0;
+  *dist = 0;
+  if (*sse < INT64_MAX)
+    *sse = (*sse << 6) >> shift;
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
+
+        if (*eob == 1)
+          *rate += (int)abs(qcoeff[0]);
+        else if (*eob > 1)
+          *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+      }
+      block += step;
+    }
+  }
+
+  if (*skippable == 0) {
+    *rate <<= 10;
+    *rate += (eob_cost << 8);
+  }
+}
+#endif
+
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               int *out_rate_sum, int64_t *out_dist_sum,
+                               unsigned int *var_y, unsigned int *sse_y) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  int i;
+
+  *out_rate_sum = 0;
+  *out_dist_sum = 0;
+
+  for (i = 1; i <= 2; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const uint32_t dc_quant = pd->dequant[0];
+    const uint32_t ac_quant = pd->dequant[1];
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    unsigned int var;
+
+    if (!x->color_sensitivity[i - 1])
+      continue;
+
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    *var_y += var;
+    *sse_y += sse;
+
+  #if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+  #else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                 dc_quant >> 3, &rate, &dist);
+  #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    *out_rate_sum += rate >> 1;
+    *out_dist_sum += dist << 3;
+
+  #if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                   ac_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                   ac_quant >> 3, &rate, &dist);
+    }
+  #else
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+                                 ac_quant >> 3, &rate, &dist);
+  #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    *out_rate_sum += rate;
+    *out_dist_sum += dist << 4;
+  }
+}
+
 static int get_pred_buffer(PRED_BUFFER *p, int len) {
   int i;
 
@@ -253,7 +765,8 @@
 }
 
 static void free_pred_buffer(PRED_BUFFER *p) {
-  p->in_use = 0;
+  if (p != NULL)
+    p->in_use = 0;
 }
 
 static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
@@ -264,7 +777,6 @@
                                  struct buf_2d yv12_mb[][MAX_MB_PLANE],
                                  int *rate, int64_t *dist) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
 
   const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
   unsigned int var = var_y, sse = sse_y;
@@ -280,16 +792,29 @@
     // The encode_breakout input
     const unsigned int min_thresh =
         MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int shift = (xd->bd << 1) - 16;
+#endif
 
     // Calculate threshold according to dequant value.
-    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust ac threshold according to partition size.
     thresh_ac >>=
-        8 - (b_width_log2(bsize) + b_height_log2(bsize));
+        8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
 
     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     thresh_ac = 0;
     thresh_dc = 0;
@@ -314,19 +839,19 @@
                                     xd->plane[1].dst.stride, &sse_u);
 
     // U skipping condition checking
-    if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+    if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
       var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                       x->plane[2].src.stride,
                                       xd->plane[2].dst.buf,
                                       xd->plane[2].dst.stride, &sse_v);
 
       // V skipping condition checking
-      if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+      if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
         x->skip = 1;
 
         // The cost of skip bit needs to be added.
-        *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                     [INTER_OFFSET(this_mode)];
+        *rate = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                    [INTER_OFFSET(this_mode)];
 
         // More on this part of rate
         // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -343,80 +868,274 @@
   }
 }
 
+struct estimate_block_intra_args {
+  VP9_COMP *cpi;
+  MACROBLOCK *x;
+  PREDICTION_MODE mode;
+  int rate;
+  int64_t dist;
+};
+
+static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                                 TX_SIZE tx_size, void *arg) {
+  struct estimate_block_intra_args* const args = arg;
+  VP9_COMP *const cpi = args->cpi;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  int rate;
+  int64_t dist;
+  int64_t this_sse = INT64_MAX;
+  int is_skippable;
+
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  assert(plane == 0);
+  (void) plane;
+
+  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+  // Use source buffer as an approximation for the fully reconstructed buffer.
+  vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
+                          tx_size, args->mode,
+                          x->skip_encode ? p->src.buf : pd->dst.buf,
+                          x->skip_encode ? src_stride : dst_stride,
+                          pd->dst.buf, dst_stride,
+                          i, j, 0);
+
+  // TODO(jingning): This needs further refactoring.
+  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+            bsize_tx, MIN(tx_size, TX_16X16));
+  x->skip_txfm[0] = is_skippable;
+  rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  args->rate += rate;
+  args->dist += dist;
+}
+
 static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+  {THR_DC, THR_V_PRED, THR_H_PRED, THR_TM},
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
-  {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
+};
+
+static const PREDICTION_MODE intra_mode_list[] = {
+  DC_PRED, V_PRED, H_PRED, TM_PRED
+};
+
+static int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV) {
+    return INTER_OFFSET(mode);
+  } else {
+    switch (mode) {
+      case DC_PRED:
+        return 0;
+      case V_PRED:
+        return 1;
+      case H_PRED:
+        return 2;
+      case TM_PRED:
+        return 3;
+      default:
+        return -1;
+    }
+  }
+}
+
+static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
+                                           TileDataEnc *tile_data,
+                                           BLOCK_SIZE bsize,
+                                           MV_REFERENCE_FRAME ref_frame,
+                                           THR_MODES best_mode_idx,
+                                           PREDICTION_MODE mode) {
+  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
+  if (thr_mode_idx == best_mode_idx)
+    *freq_fact -= (*freq_fact >> 4);
+  else
+    *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
+        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+}
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  RD_COST this_rdc, best_rdc;
+  PREDICTION_MODE this_mode;
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+  const TX_SIZE intra_tx_size =
+      MIN(max_txsize_lookup[bsize],
+          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  MODE_INFO *const mic = xd->mi[0];
+  int *bmode_costs;
+  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+  bmode_costs = cpi->y_mode_costs[A][L];
+
+  (void) ctx;
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(&this_rdc);
+
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->mv[0].as_int = INVALID_MV;
+  mbmi->uv_mode = DC_PRED;
+  memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
+  for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+    args.mode = this_mode;
+    args.rate = 0;
+    args.dist = 0;
+    mbmi->tx_size = intra_tx_size;
+    vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                           estimate_block_intra, &args);
+    this_rdc.rate = args.rate;
+    this_rdc.dist = args.dist;
+    this_rdc.rate += bmode_costs[this_mode];
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                             this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      mbmi->mode = this_mode;
+    }
+  }
+
+  *rd_cost = best_rdc;
+}
+
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+                                MACROBLOCKD *const xd,
+                                int ref_frame_cost[MAX_REF_FRAMES]) {
+  vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
+  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+  ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+    ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+  ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+}
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+#define RT_INTER_MODES 8
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+    {LAST_FRAME, ZEROMV},
+    {LAST_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, ZEROMV},
+    {LAST_FRAME, NEARMV},
+    {LAST_FRAME, NEWMV},
+    {GOLDEN_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, NEARMV},
+    {GOLDEN_FRAME, NEWMV}
+};
+static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
+    {LAST_FRAME, ZEROMV},
+    {GOLDEN_FRAME, ZEROMV},
+    {LAST_FRAME, NEARESTMV},
+    {LAST_FRAME, NEARMV},
+    {GOLDEN_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, NEARMV},
+    {LAST_FRAME, NEWMV},
+    {GOLDEN_FRAME, NEWMV}
 };
 
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            const TileInfo *const tile,
-                            int mi_row, int mi_col,
-                            int *returnrate,
-                            int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct macroblock_plane *const p = &x->plane[0];
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblockd_plane *const pd = &xd->plane[0];
-  PREDICTION_MODE this_mode, best_mode = ZEROMV;
+  PREDICTION_MODE best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
-  TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
-                             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  MV_REFERENCE_FRAME usable_ref_frame;
+  TX_SIZE best_tx_size = TX_SIZES;
   INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd = INT64_MAX;
-  int skip_txfm = 0;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
+  RD_COST this_rdc, best_rdc;
+  uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-
-  VP9_COMMON *cm = &cpi->common;
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
-
+  // Reduce the intra cost penalty for small blocks (<=16x16).
+  const int reduction_fac = (bsize <= BLOCK_16X16) ?
+      ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
-  const int64_t intra_mode_cost = 50;
-
-  unsigned char segment_id = mbmi->segment_id;
-  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
-  const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
-  // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame.
-  INTERP_FILTER filter_ref = cm->interp_filter;
-  int bsl = mi_width_log2(bsize);
+  const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  INTERP_FILTER filter_ref;
+  const int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
   int const_motion[MAX_REF_FRAMES] = { 0 };
-  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
-  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
-  int pixels_in_block = bh * bw;
+  const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
   // For speed 6, the result of interp filter is reused later in actual encoding
   // process.
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
   PRED_BUFFER tmp[4];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
+#endif
   struct buf_2d orig_dst = pd->dst;
   PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
-  int i;
+  const int pixels_in_block = bh * bw;
+  int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
+  int ref_frame_skip_mask = 0;
+  int idx;
+  int best_pred_sad = INT_MAX;
+  int best_early_term = 0;
+  int ref_frame_cost[MAX_REF_FRAMES];
 
-  // CTX is used by the temporal denoiser which is currently being developed.
-  // TODO(jbb): when temporal denoiser is finished and in the default build
-  // remove the following line;
-  (void) ctx;
-  if (cpi->sf.reuse_inter_pred_sby) {
+  init_ref_frame_cost(cm, xd, ref_frame_cost);
+
+  if (reuse_inter_pred) {
+    int i;
     for (i = 0; i < 3; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+      else
+        tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
       tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       tmp[i].stride = bw;
       tmp[i].in_use = 0;
     }
@@ -428,41 +1147,52 @@
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   x->skip = 0;
 
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->mbmi.interp_filter;
+  else
+    filter_ref = cm->interp_filter;
+
   // initialize mode decisions
-  *returnrate = INT_MAX;
-  *returndistortion = INT64_MAX;
-  vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(rd_cost);
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
   mbmi->ref_frame[1] = NONE;
   mbmi->tx_size = MIN(max_txsize_lookup[bsize],
                       tx_mode_to_biggest_tx_size[cm->tx_mode]);
-  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
-                        EIGHTTAP : cm->interp_filter;
-  mbmi->segment_id = segment_id;
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  vp9_denoiser_reset_frame_stats(ctx);
+#endif
+
+  if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) {
+    usable_ref_frame = LAST_FRAME;
+  } else {
+    usable_ref_frame = GOLDEN_FRAME;
+  }
+  for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+
     x->pred_mv_sad[ref_frame] = INT_MAX;
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
 
-    if (xd->up_available)
-      filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-    else if (xd->left_available)
-      filter_ref = xd->mi[-1]->mbmi.interp_filter;
-
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-      int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+      int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
       const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+
       vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                            sf, sf);
 
-      if (!cm->error_resilient_mode)
-        vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
-                         candidates, mi_row, mi_col);
+      if (cm->use_prev_frame_mvs)
+        vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
+                         candidates, mi_row, mi_col, NULL, NULL,
+                         x->mbmi_ext->mode_context);
       else
-        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
+        const_motion[ref_frame] = mv_refs_rt(cm, x, xd, tile_info,
+                                             xd->mi[0],
                                              ref_frame, candidates,
                                              mi_row, mi_col);
 
@@ -474,188 +1204,308 @@
         vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
                     ref_frame, bsize);
     } else {
-      continue;
+      ref_frame_skip_mask |= (1 << ref_frame);
     }
+  }
+
+  for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+    int rate_mv = 0;
+    int mode_rd_thresh;
+    int mode_index;
+    int i;
+    int64_t this_sse;
+    int is_skippable;
+    int this_early_term = 0;
+    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+    if (cpi->use_svc)
+      this_mode = ref_mode_set_svc[idx].pred_mode;
+
+    if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
+      continue;
+
+    ref_frame = ref_mode_set[idx].ref_frame;
+    if (cpi->use_svc)
+      ref_frame = ref_mode_set_svc[idx].ref_frame;
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+    if (const_motion[ref_frame] && this_mode == NEARMV)
+      continue;
+
+    i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+    if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
+      if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+        ref_frame_skip_mask |= (1 << ref_frame);
+    if (ref_frame_skip_mask & (1 << ref_frame))
+      continue;
 
     // Select prediction reference frames.
-    xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
-
-    clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
-    clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
 
     mbmi->ref_frame[0] = ref_frame;
+    set_ref_ptrs(cm, xd, ref_frame, NONE);
 
-    for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-      int rate_mv = 0;
-      int mode_rd_thresh;
+    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+    mode_rd_thresh = best_mode_skip_txfm ?
+            rd_threshes[mode_index] << 1 : rd_threshes[mode_index];
+    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]))
+      continue;
 
-      if (const_motion[ref_frame] &&
-          (this_mode == NEARMV || this_mode == ZEROMV))
-        continue;
+    if (this_mode == NEWMV) {
+      if (ref_frame > LAST_FRAME && !cpi->use_svc) {
+        int tmp_sad;
+        int dis, cost_list[5];
 
-      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
-        continue;
-
-      mode_rd_thresh =  rd_threshes[mode_idx[ref_frame - LAST_FRAME]
-                                            [this_mode - NEARESTMV]];
-      if (rd_less_than_thresh(best_rd, mode_rd_thresh,
-                              rd_thresh_freq_fact[this_mode]))
-        continue;
-
-      if (this_mode == NEWMV) {
-        if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+        if (bsize < BLOCK_16X16)
           continue;
-        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                    &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rd))
+
+        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
+        if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
           continue;
+        if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
+          continue;
+
+        frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+        rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+          &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+          x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+        frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+        frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+        cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+          &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          cpi->sf.mv.subpel_force_stop,
+          cpi->sf.mv.subpel_iters_per_step,
+          cond_cost_list(cpi, cost_list),
+          x->nmvjointcost, x->mvcost, &dis,
+          &x->pred_sse[ref_frame], NULL, 0, 0);
+      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+        &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost)) {
+        continue;
       }
+    }
 
-      if (this_mode != NEARESTMV &&
-          frame_mv[this_mode][ref_frame].as_int ==
-              frame_mv[NEARESTMV][ref_frame].as_int)
-          continue;
+    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+        frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
+      const int pre_stride = xd->plane[0].pre[0].stride;
+      const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+          (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride +
+          (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3);
+      best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                   x->plane[0].src.stride,
+                                   pre_buf, pre_stride);
+      x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
+    }
 
-      mbmi->mode = this_mode;
-      mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-
-      // Search for the best prediction filter type, when the resulting
-      // motion vector is at sub-pixel accuracy level for luma component, i.e.,
-      // the last three bits are all zeros.
-      if (cpi->sf.reuse_inter_pred_sby) {
-        if (this_mode == NEARESTMV) {
-          this_mode_pred = &tmp[3];
-        } else {
-          this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
-          pd->dst.buf = this_mode_pred->data;
-          pd->dst.stride = bw;
-        }
+    if (cpi->use_svc) {
+      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
+          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
+        const int pre_stride = xd->plane[0].pre[0].stride;
+        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
+            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
+        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                               x->plane[0].src.stride,
+                                               pre_buf, pre_stride);
+        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
       }
+    }
 
-      if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search &&
-          ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
-           (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
-        int pf_rate[3];
-        int64_t pf_dist[3];
-        unsigned int pf_var[3];
-        unsigned int pf_sse[3];
-        TX_SIZE pf_tx_size[3];
-        int64_t best_cost = INT64_MAX;
-        INTERP_FILTER best_filter = SWITCHABLE, filter;
-        PRED_BUFFER *current_pred = this_mode_pred;
 
-        for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
-          int64_t cost;
-          mbmi->interp_filter = filter;
-          vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
-                            &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
-          cost = RDCOST(x->rdmult, x->rddiv,
-                        vp9_get_switchable_rate(cpi) + pf_rate[filter],
-                        pf_dist[filter]);
-          pf_tx_size[filter] = mbmi->tx_size;
-          if (cost < best_cost) {
-            best_filter = filter;
-            best_cost = cost;
-            skip_txfm = x->skip_txfm[0];
+    if (this_mode != NEARESTMV &&
+        frame_mv[this_mode][ref_frame].as_int ==
+            frame_mv[NEARESTMV][ref_frame].as_int)
+      continue;
 
-            if (cpi->sf.reuse_inter_pred_sby) {
-              if (this_mode_pred != current_pred) {
-                free_pred_buffer(this_mode_pred);
-                this_mode_pred = current_pred;
-              }
+    mbmi->mode = this_mode;
+    mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
 
-              if (filter < EIGHTTAP_SHARP) {
-                current_pred = &tmp[get_pred_buffer(tmp, 3)];
-                pd->dst.buf = current_pred->data;
-                pd->dst.stride = bw;
-              }
+    // Search for the best prediction filter type, when the resulting
+    // motion vector is at sub-pixel accuracy level for luma component, i.e.,
+    // the last three bits are all zeros.
+    if (reuse_inter_pred) {
+      if (!this_mode_pred) {
+        this_mode_pred = &tmp[3];
+      } else {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+
+    if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
+        && (ref_frame == LAST_FRAME ||
+            (ref_frame == GOLDEN_FRAME && cpi->use_svc))
+        && (((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07) != 0)) {
+      int pf_rate[3];
+      int64_t pf_dist[3];
+      unsigned int pf_var[3];
+      unsigned int pf_sse[3];
+      TX_SIZE pf_tx_size[3];
+      int64_t best_cost = INT64_MAX;
+      INTERP_FILTER best_filter = SWITCHABLE, filter;
+      PRED_BUFFER *current_pred = this_mode_pred;
+
+      for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
+        int64_t cost;
+        mbmi->interp_filter = filter;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+                          &pf_var[filter], &pf_sse[filter]);
+        pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+        cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+        pf_tx_size[filter] = mbmi->tx_size;
+        if (cost < best_cost) {
+          best_filter = filter;
+          best_cost = cost;
+          skip_txfm = x->skip_txfm[0];
+
+          if (reuse_inter_pred) {
+            if (this_mode_pred != current_pred) {
+              free_pred_buffer(this_mode_pred);
+              this_mode_pred = current_pred;
+            }
+
+            if (filter < EIGHTTAP_SHARP) {
+              current_pred = &tmp[get_pred_buffer(tmp, 3)];
+              pd->dst.buf = current_pred->data;
+              pd->dst.stride = bw;
             }
           }
         }
-
-        if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
-          free_pred_buffer(current_pred);
-
-        mbmi->interp_filter = best_filter;
-        mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
-        rate = pf_rate[mbmi->interp_filter];
-        dist = pf_dist[mbmi->interp_filter];
-        var_y = pf_var[mbmi->interp_filter];
-        sse_y = pf_sse[mbmi->interp_filter];
-        x->skip_txfm[0] = skip_txfm;
-      } else {
-        mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
-        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
       }
 
-      rate += rate_mv;
-      rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                [INTER_OFFSET(this_mode)];
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      if (reuse_inter_pred && this_mode_pred != current_pred)
+        free_pred_buffer(current_pred);
 
-      // Skipping checking: test to see if this block can be reconstructed by
-      // prediction only.
-      if (cpi->allow_encode_breakout) {
-        encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
-                             this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
-        if (x->skip) {
-          rate += rate_mv;
-          this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      mbmi->interp_filter = best_filter;
+      mbmi->tx_size = pf_tx_size[best_filter];
+      this_rdc.rate = pf_rate[best_filter];
+      this_rdc.dist = pf_dist[best_filter];
+      var_y = pf_var[best_filter];
+      sse_y = pf_sse[best_filter];
+      x->skip_txfm[0] = skip_txfm;
+      if (reuse_inter_pred) {
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = this_mode_pred->stride;
+      }
+    } else {
+      mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+      // For large partition blocks, extra testing is done.
+      if (bsize > BLOCK_32X32 &&
+        !cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id) &&
+        cm->base_qindex) {
+        model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+                                &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+                                &this_early_term);
+      } else {
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &var_y, &sse_y);
+      }
+    }
+
+    if (!this_early_term) {
+      this_sse = (int64_t)sse_y;
+      block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+                &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+      x->skip_txfm[0] = is_skippable;
+      if (is_skippable) {
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+      } else {
+        if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+            RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+          this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        } else {
+          this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+          this_rdc.dist = this_sse;
+          x->skip_txfm[0] = SKIP_TXFM_AC_DC;
         }
       }
 
+      if (cm->interp_filter == SWITCHABLE) {
+        if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+          this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+      }
+    } else {
+      this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+          vp9_get_switchable_rate(cpi, xd) : 0;
+      this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+    }
+
+    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+      int uv_rate = 0;
+      int64_t uv_dist = 0;
+      if (x->color_sensitivity[0])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+      if (x->color_sensitivity[1])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+                         &var_y, &sse_y);
+      this_rdc.rate += uv_rate;
+      this_rdc.dist += uv_dist;
+    }
+
+    this_rdc.rate += rate_mv;
+    this_rdc.rate +=
+        cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]][INTER_OFFSET(
+            this_mode)];
+    this_rdc.rate += ref_frame_cost[ref_frame];
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+    // Skipping checking: test to see if this block can be reconstructed by
+    // prediction only.
+    if (cpi->allow_encode_breakout) {
+      encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+                           var_y, sse_y, yv12_mb, &this_rdc.rate,
+                           &this_rdc.dist);
+      if (x->skip) {
+        this_rdc.rate += rate_mv;
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate,
+                                 this_rdc.dist);
+      }
+    }
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
-      if (cpi->oxcf.noise_sensitivity > 0) {
-        vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
-                                        this_mode, ctx);
-      }
+    if (cpi->oxcf.noise_sensitivity > 0)
+      vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+#else
+    (void)ctx;
 #endif
 
-      if (this_rd < best_rd || x->skip) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
-        best_mode = this_mode;
-        best_pred_filter = mbmi->interp_filter;
-        best_tx_size = mbmi->tx_size;
-        best_ref_frame = ref_frame;
-        skip_txfm = x->skip_txfm[0];
+    if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+      best_rdc = this_rdc;
+      best_mode = this_mode;
+      best_pred_filter = mbmi->interp_filter;
+      best_tx_size = mbmi->tx_size;
+      best_ref_frame = ref_frame;
+      best_mode_skip_txfm = x->skip_txfm[0];
+      best_early_term = this_early_term;
 
-        if (cpi->sf.reuse_inter_pred_sby) {
-          if (best_pred != NULL)
-            free_pred_buffer(best_pred);
-
-          best_pred = this_mode_pred;
-        }
-      } else {
-        if (cpi->sf.reuse_inter_pred_sby)
-          free_pred_buffer(this_mode_pred);
+      if (reuse_inter_pred) {
+        free_pred_buffer(best_pred);
+        best_pred = this_mode_pred;
       }
-
-      if (x->skip)
-        break;
+    } else {
+      if (reuse_inter_pred)
+        free_pred_buffer(this_mode_pred);
     }
-    // If the current reference frame is valid and we found a usable mode,
-    // we are done.
-    if (best_rd < INT64_MAX)
+
+    if (x->skip)
       break;
-  }
 
-  // If best prediction is not in dst buf, then copy the prediction block from
-  // temp buf to dst buf.
-  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
-    uint8_t *copy_from, *copy_to;
-
-    pd->dst = orig_dst;
-    copy_to = pd->dst.buf;
-
-    copy_from = best_pred->data;
-
-    vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
-                      bw, bh);
+    // If early termination flag is 1 and at least 2 modes are checked,
+    // the mode search is terminated.
+    if (best_early_term && idx > 0) {
+      x->skip = 1;
+      break;
+    }
   }
 
   mbmi->mode          = best_mode;
@@ -664,79 +1514,404 @@
   mbmi->ref_frame[0]  = best_ref_frame;
   mbmi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
   xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
-  x->skip_txfm[0] = skip_txfm;
+  x->skip_txfm[0] = best_mode_skip_txfm;
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (!x->skip && best_rd > inter_mode_thresh &&
-      bsize <= cpi->sf.max_intra_bsize) {
-    int i, j;
-    const int width  = num_4x4_blocks_wide_lookup[bsize];
-    const int height = num_4x4_blocks_high_lookup[bsize];
+  if (best_rdc.rdcost == INT64_MAX ||
+      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.max_intra_bsize)) {
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+    const TX_SIZE intra_tx_size =
+        MIN(max_txsize_lookup[bsize],
+            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    int i;
+    TX_SIZE best_intra_tx_size = TX_SIZES;
 
-    int rate2 = 0;
-    int64_t dist2 = 0;
-    const int dst_stride = cpi->sf.reuse_inter_pred_sby ? bw : pd->dst.stride;
-    const int src_stride = p->src.stride;
-    int block_idx = 0;
-
-    TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
-                              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size];
-    const int step = 1 << tmp_tx_size;
-
-    if (cpi->sf.reuse_inter_pred_sby) {
-      pd->dst.buf = tmp[0].data;
-      pd->dst.stride = bw;
-    }
-
-    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      uint8_t *const src_buf_base = p->src.buf;
-      uint8_t *const dst_buf_base = pd->dst.buf;
-      for (j = 0; j < height; j += step) {
-        for (i = 0; i < width; i += step) {
-          p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
-          pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
-          // Use source buffer as an approximation for the fully reconstructed
-          // buffer
-          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
-                                  tmp_tx_size, this_mode,
-                                  p->src.buf, src_stride,
-                                  pd->dst.buf, dst_stride,
-                                  i, j, 0);
-          model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
-          rate2 += rate;
-          dist2 += dist;
-          ++block_idx;
-        }
+    if (reuse_inter_pred && best_pred != NULL) {
+      if (best_pred->data == orig_dst.buf) {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
+                                   this_mode_pred->data, this_mode_pred->stride,
+                                   NULL, 0, NULL, 0, bw, bh, xd->bd);
+        else
+          vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride,
+                          NULL, 0, NULL, 0, bw, bh);
+#else
+        vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride,
+                          NULL, 0, NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        best_pred = this_mode_pred;
       }
-      p->src.buf = src_buf_base;
-      pd->dst.buf = dst_buf_base;
+    }
+    pd->dst = orig_dst;
 
-      rate = rate2;
-      dist = dist2;
+    for (i = 0; i < 4; ++i) {
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+      int mode_rd_thresh = rd_threshes[mode_index];
 
-      rate += cpi->mbmode_cost[this_mode];
-      rate += intra_cost_penalty;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
+        continue;
 
-      if (cpi->sf.reuse_inter_pred_sby)
-        pd->dst = orig_dst;
+      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                              rd_thresh_freq_fact[mode_index]))
+        continue;
 
-      if (this_rd + intra_mode_cost < best_rd) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
-        mbmi->mode = this_mode;
-        mbmi->tx_size = tmp_tx_size;
-        mbmi->ref_frame[0] = INTRA_FRAME;
+      mbmi->mode = this_mode;
+      mbmi->ref_frame[0] = INTRA_FRAME;
+      args.mode = this_mode;
+      args.rate = 0;
+      args.dist = 0;
+      mbmi->tx_size = intra_tx_size;
+      vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                             estimate_block_intra, &args);
+      this_rdc.rate = args.rate;
+      this_rdc.dist = args.dist;
+      this_rdc.rate += cpi->mbmode_cost[this_mode];
+      this_rdc.rate += ref_frame_cost[INTRA_FRAME];
+      this_rdc.rate += intra_cost_penalty;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        best_mode = this_mode;
+        best_intra_tx_size = mbmi->tx_size;
+        best_ref_frame = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
-      } else {
-        x->skip_txfm[0] = skip_txfm;
+        best_mode_skip_txfm = x->skip_txfm[0];
+      }
+    }
+
+    // Reset mb_mode_info to the best inter mode.
+    if (best_ref_frame != INTRA_FRAME) {
+      mbmi->tx_size = best_tx_size;
+    } else {
+      mbmi->tx_size = best_intra_tx_size;
+    }
+  }
+
+  pd->dst = orig_dst;
+  mbmi->mode = best_mode;
+  mbmi->ref_frame[0] = best_ref_frame;
+  x->skip_txfm[0] = best_mode_skip_txfm;
+
+  if (reuse_inter_pred && best_pred != NULL) {
+    if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
+                                 pd->dst.buf, pd->dst.stride, NULL, 0,
+                                 NULL, 0, bw, bh, xd->bd);
+      else
+        vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          pd->dst.buf, pd->dst.stride, NULL, 0,
+                          NULL, 0, bw, bh);
+#else
+      vpx_convolve_copy(best_pred->data, best_pred->stride,
+                        pd->dst.buf, pd->dst.stride, NULL, 0,
+                        NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+  }
+
+  if (cpi->sf.adaptive_rd_thresh) {
+    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
+
+    if (best_ref_frame == INTRA_FRAME) {
+      // Only consider the modes that are included in the intra_mode_list.
+      int intra_modes = sizeof(intra_mode_list)/sizeof(PREDICTION_MODE);
+      int i;
+
+      // TODO(yunqingwang): Check intra mode mask and only update freq_fact
+      // for those valid modes.
+      for (i = 0; i < intra_modes; i++) {
+        update_thresh_freq_fact(cpi, tile_data, bsize, INTRA_FRAME,
+                                best_mode_idx, intra_mode_list[i]);
+      }
+    } else {
+      for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+        PREDICTION_MODE this_mode;
+        if (best_ref_frame != ref_frame) continue;
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          update_thresh_freq_fact(cpi, tile_data, bsize, ref_frame,
+                                  best_mode_idx, this_mode);
+        }
       }
     }
   }
 
-  return INT64_MAX;
+  *rd_cost = best_rdc;
+}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const struct segmentation *const seg = &cm->seg;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  unsigned char segment_id = mbmi->segment_id;
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  b_mode_info bsi[MAX_REF_FRAMES][4];
+  int ref_frame_skip_mask = 0;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  ctx->pred_pixel_ready = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+    int_mv dummy_mv[2];
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+
+    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+      int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
+      const struct scale_factors *const sf =
+                             &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+      vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
+                       candidates, mi_row, mi_col, NULL, NULL,
+                       mbmi_ext->mode_context);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &dummy_mv[0], &dummy_mv[1]);
+    } else {
+      ref_frame_skip_mask |= (1 << ref_frame);
+    }
+  }
+
+  mbmi->sb_type = bsize;
+  mbmi->tx_size = TX_4X4;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                        : cm->interp_filter;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    int64_t this_rd = 0;
+    int plane;
+
+    if (ref_frame_skip_mask & (1 << ref_frame))
+      continue;
+
+    // TODO(jingning, agrange): Scaling reference frame not supported for
+    // sub8x8 blocks. Is this supported now?
+    if (ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+
+    // If the segment reference frame feature is enabled, then skip the
+    // current ref frame if it is not the one allowed for this segment.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+      continue;
+
+    mbmi->ref_frame[0] = ref_frame;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (plane = 0; plane < MAX_MB_PLANE; plane++)
+      xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+        int_mv b_mv[MB_MODE_COUNT];
+        int64_t b_best_rd = INT64_MAX;
+        const int i = idy * 2 + idx;
+        PREDICTION_MODE this_mode;
+        RD_COST this_rdc;
+        unsigned int var_y, sse_y;
+
+        struct macroblock_plane *p = &x->plane[0];
+        struct macroblockd_plane *pd = &xd->plane[0];
+
+        const struct buf_2d orig_src = p->src;
+        const struct buf_2d orig_dst = pd->dst;
+        struct buf_2d orig_pre[2];
+        memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+        // set buffer pointers for sub8x8 motion search.
+        p->src.buf =
+            &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+        pd->dst.buf =
+            &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+        pd->pre[0].buf =
+            &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+                                                    i, pd->pre[0].stride)];
+
+        b_mv[ZEROMV].as_int = 0;
+        b_mv[NEWMV].as_int = INVALID_MV;
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col,
+                                      &b_mv[NEARESTMV],
+                                      &b_mv[NEARMV],
+                                      mbmi_ext->mode_context);
+
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          int b_rate = 0;
+          xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+          if (this_mode == NEWMV) {
+            const int step_param = cpi->sf.mv.fullpel_search_step_param;
+            MV mvp_full;
+            MV tmp_mv;
+            int cost_list[5];
+            const int tmp_col_min = x->mv_col_min;
+            const int tmp_col_max = x->mv_col_max;
+            const int tmp_row_min = x->mv_row_min;
+            const int tmp_row_max = x->mv_row_max;
+            int dummy_dist;
+
+            if (i == 0) {
+              mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+              mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+            } else {
+              mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3;
+              mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3;
+            }
+
+            vp9_set_mv_search_range(x, &mbmi_ext->ref_mvs[0]->as_mv);
+
+            vp9_full_pixel_search(
+                cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+                cond_cost_list(cpi, cost_list),
+                &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+                INT_MAX, 0);
+
+            x->mv_col_min = tmp_col_min;
+            x->mv_col_max = tmp_col_max;
+            x->mv_row_min = tmp_row_min;
+            x->mv_row_max = tmp_row_max;
+
+            // calculate the bit cost on motion vector
+            mvp_full.row = tmp_mv.row * 8;
+            mvp_full.col = tmp_mv.col * 8;
+
+            b_rate += vp9_mv_bit_cost(&mvp_full,
+                                      &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                                      x->nmvjointcost, x->mvcost,
+                                      MV_COST_WEIGHT);
+
+            b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                          [INTER_OFFSET(NEWMV)];
+            if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+              continue;
+
+            cpi->find_fractional_mv_step(x, &tmp_mv,
+                                         &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[bsize],
+                                         cpi->sf.mv.subpel_force_stop,
+                                         cpi->sf.mv.subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
+                                         x->nmvjointcost, x->mvcost,
+                                         &dummy_dist,
+                                         &x->pred_sse[ref_frame], NULL, 0, 0);
+
+            xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
+          } else {
+            b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                          [INTER_OFFSET(this_mode)];
+          }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                    pd->dst.buf, pd->dst.stride,
+                                    &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+                                    &xd->block_refs[0]->sf,
+                                    4 * num_4x4_blocks_wide,
+                                    4 * num_4x4_blocks_high, 0,
+                                    vp9_filter_kernels[mbmi->interp_filter],
+                                    MV_PRECISION_Q3,
+                                    mi_col * MI_SIZE + 4 * (i & 0x01),
+                                    mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+          } else {
+#endif
+            vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                     pd->dst.buf, pd->dst.stride,
+                                     &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+                                     &xd->block_refs[0]->sf,
+                                     4 * num_4x4_blocks_wide,
+                                     4 * num_4x4_blocks_high, 0,
+                                     vp9_filter_kernels[mbmi->interp_filter],
+                                     MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i & 0x01),
+                                     mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          }
+#endif
+
+          model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                            &var_y, &sse_y);
+
+          this_rdc.rate += b_rate;
+          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                   this_rdc.rate, this_rdc.dist);
+          if (this_rdc.rdcost < b_best_rd) {
+            b_best_rd = this_rdc.rdcost;
+            bsi[ref_frame][i].as_mode = this_mode;
+            bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv;
+          }
+        }  // mode search
+
+        // restore source and prediction buffer pointers.
+        p->src = orig_src;
+        pd->pre[0] = orig_pre[0];
+        pd->dst = orig_dst;
+        this_rd += b_best_rd;
+
+        xd->mi[0]->bmi[i] = bsi[ref_frame][i];
+        if (num_4x4_blocks_wide > 1)
+          xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i];
+        if (num_4x4_blocks_high > 1)
+          xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i];
+      }
+    }  // loop through sub8x8 blocks
+
+    if (this_rd < best_rd) {
+      best_rd = this_rd;
+      best_ref_frame = ref_frame;
+    }
+  }  // reference frames
+
+  mbmi->tx_size = TX_4X4;
+  mbmi->ref_frame[0] = best_ref_frame;
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      const int block = idy * 2 + idx;
+      xd->mi[0]->bmi[block] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_wide > 1)
+        xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_high > 1)
+        xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
+    }
+  }
+  mbmi->mode = xd->mi[0]->bmi[3].as_mode;
+  ctx->mic = *(xd->mi[0]);
+  ctx->mbmi_ext = *x->mbmi_ext;
+  ctx->skip_txfm[0] = SKIP_TXFM_NONE;
+  ctx->skip = 0;
+  // Dummy assignment for speed -5. No effect in speed -6.
+  rd_cost->rdcost = best_rd;
 }
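
Both pick_inter_mode paths above rank candidates with RDCOST(x->rdmult, x->rddiv, rate, dist), i.e. a fixed-point Lagrangian cost that trades estimated bits against estimated distortion. The standalone sketch below only illustrates that comparison; the scaling constants and the demo_ names are illustrative, not the encoder's (the real macro and the derivation of rdmult/rddiv live in vp9_rd.h).

#include <stdint.h>
#include <stdio.h>

/* Illustrative fixed-point RD cost: rate weighted by rdmult and scaled down,
 * plus the distortion scaled up by rddiv.  The exact shifts in libvpx differ;
 * only the comparison matters here. */
static int64_t demo_rdcost(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((int64_t)rate * rdmult >> 8) + (dist << rddiv);
}

int main(void) {
  const int rdmult = 70, rddiv = 2;
  /* Candidate A: cheap to code, larger error.  Candidate B: the opposite. */
  const int64_t rd_a = demo_rdcost(rdmult, rddiv, 300, 9000);
  const int64_t rd_b = demo_rdcost(rdmult, rddiv, 900, 4000);
  printf("best candidate: %s\n", rd_a < rd_b ? "A" : "B");  /* prints "B" */
  return 0;
}

A mode that costs more bits can still win if it buys enough distortion reduction, which is exactly how best_rdc is updated in the loops above.
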
diff --git a/libvpx/vp9/encoder/vp9_pickmode.h b/libvpx/vp9/encoder/vp9_pickmode.h
index 49c6feb..a43bb81 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.h
+++ b/libvpx/vp9/encoder/vp9_pickmode.h
@@ -17,19 +17,19 @@
 extern "C" {
 #endif
 
-typedef struct {
-  uint8_t *data;
-  int stride;
-  int in_use;
-} PRED_BUFFER;
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            const struct TileInfo *const tile,
-                            int mi_row, int mi_col,
-                            int *returnrate,
-                            int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize,
+                         PICK_MODE_CONTEXT *ctx);
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx);
 
 #ifdef __cplusplus
 }  // extern "C"
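
The header change above replaces the old int64_t return value and the returnrate/returndistortion out-pointers with a single RD_COST out-parameter, matching how this_rdc/best_rdc are filled in the .c file. A minimal sketch of that calling convention follows; the struct fields mirror the rate/dist/rdcost usage above, everything else (demo_ names, constants) is hypothetical.

#include <stdint.h>

typedef struct {
  int rate;        /* estimated bits */
  int64_t dist;    /* estimated distortion */
  int64_t rdcost;  /* combined rate-distortion cost */
} demo_rd_cost;

static void demo_pick_mode(demo_rd_cost *rd_cost) {
  /* A real picker searches modes; this just fills plausible values. */
  rd_cost->rate = 512;
  rd_cost->dist = 2048;
  rd_cost->rdcost = rd_cost->dist + ((int64_t)rd_cost->rate * 70 >> 8);
}

int main(void) {
  demo_rd_cost rdc;
  demo_pick_mode(&rdc);  /* caller reads rdc.rate / rdc.dist / rdc.rdcost */
  return rdc.rdcost > 0 ? 0 : 1;
}
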
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index eababdb..cb3e21a 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -9,8 +9,9 @@
  */
 
 #include <math.h>
-
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -19,72 +20,28 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  if (!skip_block) {
-
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
                        const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr,
-                       int zbin_oq_value, uint16_t *eob_ptr,
+                       uint16_t *eob_ptr,
                        const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
@@ -103,25 +60,69 @@
   *eob_ptr = eob + 1;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
+                              intptr_t count,
+                              int skip_block,
+                              const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              uint16_t *eob_ptr,
+                              const int16_t *scan,
+                              const int16_t *iscan) {
+  int i;
+  int eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      if (abs_qcoeff)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block,
                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                              const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
-                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr,
-                             int zbin_oq_value, uint16_t *eob_ptr,
+                             uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
-  (void)zbin_oq_value;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     for (i = 0; i < n_coeffs; i++) {
@@ -146,114 +147,49 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
-                      int skip_block,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr,
-                      int zbin_oq_value, uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)count, eob = -1;
-  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
-                         zbin_ptr[1] + zbin_oq_value };
-  const int nzbins[2] = { zbins[0] * -1,
-                          zbins[1] * -1 };
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *zbin_ptr,
+                                    const int16_t *round_ptr,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *quant_shift_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    const int16_t *dequant_ptr,
+                                    uint16_t *eob_ptr,
+                                    const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)count - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
+    for (i = 0; i < n_coeffs; i++) {
+      uint32_t abs_qcoeff = 0;
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= zbins[rc != 0]) {
-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
-          eob = i;
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        const int64_t tmp = abs_coeff
+                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
       }
+
+      if (abs_qcoeff)
+        eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
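
The 32x32 fast-path quantizer above differs from the smaller-block version in three visible ways: coefficients below dequant/4 are dropped outright, the rounding term is halved, and the >>15 scale is paired with a dequant/2 reconstruction. A worked example with made-up constants:

#include <stdio.h>

#define DEMO_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  /* Made-up AC-position (rc != 0) values. */
  const int abs_coeff = 9;
  const int round = 8, quant = 22000 /* Q16 */, dequant = 12;

  if (abs_coeff >= (dequant >> 2)) {  /* dead zone: quantize only if >= dequant/4 */
    const long long tmp = abs_coeff + DEMO_ROUND_POWER_OF_TWO(round, 1); /* 9 + 4 */
    const int abs_qcoeff = (int)((tmp * quant) >> 15);                   /* 13*22000 >> 15 = 8 */
    const int dqcoeff = abs_qcoeff * dequant / 2;                        /* 8*12/2 = 48 */
    printf("abs_qcoeff=%d dqcoeff=%d\n", abs_qcoeff, dqcoeff);
  } else {
    printf("coefficient zeroed\n");
  }
  return 0;
}
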
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr,
-                            int zbin_oq_value, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
+#endif
 
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
@@ -261,12 +197,24 @@
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
-           16, x->skip_block,
-           p->zbin, p->round, p->quant, p->quant_shift,
-           BLOCK_OFFSET(p->qcoeff, block),
-           BLOCK_OFFSET(pd->dqcoeff, block),
-           pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                          16, x->skip_block,
+                          p->zbin, p->round, p->quant, p->quant_shift,
+                          BLOCK_OFFSET(p->qcoeff, block),
+                          BLOCK_OFFSET(pd->dqcoeff, block),
+                          pd->dequant, &p->eobs[block],
+                          scan, iscan);
+    return;
+  }
+#endif
+  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                 16, x->skip_block,
+                 p->zbin, p->round, p->quant, p->quant_shift,
+                 BLOCK_OFFSET(p->qcoeff, block),
+                 BLOCK_OFFSET(pd->dqcoeff, block),
+                 pd->dequant, &p->eobs[block], scan, iscan);
 }
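
vp9_regular_quantize_b_4x4 now dispatches between vpx_highbd_quantize_b and vpx_quantize_b through a compile-time CONFIG_VP9_HIGHBITDEPTH gate plus the per-buffer YV12_FLAG_HIGHBITDEPTH flag. A stripped-down sketch of that pattern, using hypothetical demo_ names in place of the real kernels:

#include <stdio.h>

#define DEMO_HIGHBITDEPTH 1          /* stands in for CONFIG_VP9_HIGHBITDEPTH */
#define DEMO_FLAG_HBD     (1 << 3)   /* stands in for YV12_FLAG_HIGHBITDEPTH */

static void demo_quantize_8bit(void) { printf("8-bit path\n"); }
#if DEMO_HIGHBITDEPTH
static void demo_quantize_hbd(void)  { printf("high-bit-depth path\n"); }
#endif

/* Compile-time gate plus per-buffer runtime flag, as in the hunk above. */
static void demo_dispatch(unsigned buf_flags) {
#if DEMO_HIGHBITDEPTH
  if (buf_flags & DEMO_FLAG_HBD) {
    demo_quantize_hbd();
    return;
  }
#endif
  demo_quantize_8bit();
}

int main(void) {
  demo_dispatch(0);              /* 8-bit buffer */
  demo_dispatch(DEMO_FLAG_HBD);  /* 10/12-bit buffer */
  return 0;
}
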
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -280,13 +228,33 @@
   *shift = 1 << (16 - l);
 }
 
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+  const int quant = vp9_dc_quant(q, 0, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+    case VPX_BITS_10:
+      return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+    case VPX_BITS_12:
+      return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+#endif
+}
+
 void vp9_init_quantizer(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   QUANTS *const quants = &cpi->quants;
   int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
@@ -295,25 +263,25 @@
         qrounding_factor_fp = 64;
 
       // y
-      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
-                     : vp9_ac_quant(q, 0);
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, 0, cm->bit_depth);
       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
       quants->y_quant_fp[q][i] = (1 << 16) / quant;
       quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
-      cm->y_dequant[q][i] = quant;
+      cpi->y_dequant[q][i] = quant;
 
       // uv
-      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
-                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
       invert_quant(&quants->uv_quant[q][i],
                    &quants->uv_quant_shift[q][i], quant);
       quants->uv_quant_fp[q][i] = (1 << 16) / quant;
       quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
-      cm->uv_dequant[q][i] = quant;
+      cpi->uv_dequant[q][i] = quant;
     }
 
     for (i = 2; i < 8; i++) {
@@ -323,7 +291,7 @@
       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
-      cm->y_dequant[q][i] = cm->y_dequant[q][1];
+      cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
 
       quants->uv_quant[q][i] = quants->uv_quant[q][1];
       quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
@@ -331,7 +299,7 @@
       quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
       quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
       quants->uv_round[q][i] = quants->uv_round[q][1];
-      cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
+      cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
     }
   }
 }
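
To make the table initialization above concrete: for a given dequant step, the zero-bin, rounding offset, and fast-path quantizer are all fixed-point products of that step. A worked example (the dequant step of 100 is made up; real values come from vp9_dc_quant()/vp9_ac_quant() for the qindex and bit depth):

#include <stdio.h>

#define DEMO_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const int quant = 100;                            /* illustrative dequant step */
  const int qzbin_factor = quant < 148 ? 84 : 80;   /* 8-bit rule from get_qzbin_factor() */
  const int qrounding_factor = 48;                  /* q != 0 case */

  const int zbin = DEMO_ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);  /* 66 */
  const int round = (qrounding_factor * quant) >> 7;                  /* 37 */
  const int quant_fp = (1 << 16) / quant;                             /* 655 */

  printf("zbin=%d round=%d quant_fp=%d quant_thred=%d\n",
         zbin, round, quant_fp, zbin * zbin);
  return 0;
}

The quant_thred values set in vp9_init_plane_quantizers below are simply the square of these zbin values, as the hunk shows.
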
@@ -343,7 +311,6 @@
   const int segment_id = xd->mi[0]->mbmi.segment_id;
   const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
-  const int zbin = cpi->zbin_mode_boost;
   int i;
 
   // Y
@@ -353,12 +320,10 @@
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
-  x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] *
-                                  cm->y_dequant[qindex][0];
-  x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] *
-                                  cm->y_dequant[qindex][1];
-  x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
-  xd->plane[0].dequant = cm->y_dequant[qindex];
+  xd->plane[0].dequant = cpi->y_dequant[qindex];
+
+  x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+  x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
 
   // UV
   for (i = 1; i < 3; i++) {
@@ -368,38 +333,23 @@
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
-    x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] *
-                                    cm->y_dequant[qindex][0];
-    x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] *
-                                    cm->y_dequant[qindex][1];
-    x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
-    xd->plane[i].dequant = cm->uv_dequant[qindex];
+    xd->plane[i].dequant = cpi->uv_dequant[qindex];
+
+    x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+    x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
   }
 
-  x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
   x->errorperbit = rdmult >> 6;
   x->errorperbit += (x->errorperbit == 0);
 
-  vp9_initialize_me_consts(cpi, x->q_index);
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  const int qindex = x->q_index;
-  const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
-                            cpi->zbin_mode_boost) >> 7;
-  const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
-                             cpi->zbin_mode_boost) >> 7;
-
-  x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
-  x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
-  x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
+  vp9_initialize_me_consts(cpi, x, x->q_index);
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  cpi->zbin_mode_boost = 0;
-  vp9_init_plane_quantizers(cpi, &cpi->mb);
+  vp9_init_plane_quantizers(cpi, &cpi->td.mb);
 }
 
 void vp9_set_quantizer(VP9_COMMON *cm, int q) {
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index 262529b..6132036 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -37,14 +37,6 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
@@ -53,8 +45,6 @@
 
 void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
 
-void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
 void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 290567e..4ba3406 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -16,13 +16,15 @@
 #include <string.h>
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -42,13 +44,56 @@
 
 #define FRAME_OVERHEAD_BITS 200
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    switch (bit_depth) { \
+      case VPX_BITS_8: \
+        name = name##_8; \
+        break; \
+      case VPX_BITS_10: \
+        name = name##_10; \
+        break; \
+      case VPX_BITS_12: \
+        name = name##_12; \
+        break; \
+      default: \
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
+                    " or VPX_BITS_12"); \
+        name = NULL; \
+    } \
+  } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    (void) bit_depth; \
+    name = name##_8; \
+  } while (0)
+#endif
+
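
ASSIGN_MINQ_TABLE selects the per-bit-depth LUT by token pasting: the same identifier resolves to the _8, _10, or _12 table depending on the bit depth passed in. A compilable sketch of the pattern with stand-in tables (the demo_ names and values are illustrative only):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define DEMO_QRANGE 4
static int demo_minq_8[DEMO_QRANGE]  = { 0, 1, 2, 3 };
static int demo_minq_10[DEMO_QRANGE] = { 0, 2, 4, 6 };
static int demo_minq_12[DEMO_QRANGE] = { 0, 4, 8, 12 };

/* Same token-pasting idea as ASSIGN_MINQ_TABLE: "name" becomes name##_<depth>. */
#define DEMO_ASSIGN_TABLE(bit_depth, name) \
  do {                                     \
    switch (bit_depth) {                   \
      case 8:  name = name##_8;  break;    \
      case 10: name = name##_10; break;    \
      case 12: name = name##_12; break;    \
      default: assert(0); name = NULL;     \
    }                                      \
  } while (0)

int main(void) {
  int *demo_minq;                    /* the selected table */
  DEMO_ASSIGN_TABLE(10, demo_minq);
  printf("minq[3] at 10-bit: %d\n", demo_minq[3]);  /* prints 6 */
  return 0;
}
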
 // Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int arfgf_low_motion_minq[QINDEX_RANGE];
-static int arfgf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-static int rtc_minq[QINDEX_RANGE];
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
 static int gf_high = 2000;
 static int gf_low = 400;
 static int kf_high = 5000;
@@ -58,7 +103,8 @@
 // formulaic approach to facilitate easier adjustment of the Q tables.
 // The formulae were derived from computing a 3rd order polynomial best
 // fit to the original data (after plotting real maxq vs minq (not q index))
-static int get_minq_index(double maxq, double x3, double x2, double x1) {
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          vpx_bit_depth_t bit_depth) {
   int i;
   const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
                                 maxq);
@@ -68,53 +114,91 @@
   if (minqtarget <= 2.0)
     return 0;
 
-  for (i = 0; i < QINDEX_RANGE; i++)
-    if (minqtarget <= vp9_convert_qindex_to_q(i))
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (minqtarget <= vp9_convert_qindex_to_q(i, bit_depth))
       return i;
+  }
 
   return QINDEX_RANGE - 1;
 }
 
-void vp9_rc_init_minq_luts() {
+static void init_minq_luts(int *kf_low_m, int *kf_high_m,
+                           int *arfgf_low, int *arfgf_high,
+                           int *inter, int *rtc, vpx_bit_depth_t bit_depth) {
   int i;
-
   for (i = 0; i < QINDEX_RANGE; i++) {
-    const double maxq = vp9_convert_qindex_to_q(i);
-    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125);
-    kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50);
-    arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30);
-    arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
-    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90);
-    rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70);
+    const double maxq = vp9_convert_qindex_to_q(i, bit_depth);
+    kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
   }
 }
 
+void vp9_rc_init_minq_luts(void) {
+  init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+                 arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+                 inter_minq_8, rtc_minq_8, VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+                 arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+                 inter_minq_10, rtc_minq_10, VPX_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+                 arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+                 inter_minq_12, rtc_minq_12, VPX_BITS_12);
+#endif
+}
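A rough numerical sketch of what these LUTs hold (illustrative values only,
not encoder output): for the inter coefficients (0.00000271, -0.00113, 0.90)
and a maximum real Q of 100.0,

  const double maxq = 100.0;  /* real Q, not a q index */
  const double minqtarget =
      ((0.00000271 * maxq - 0.00113) * maxq + 0.90) * maxq;  /* ~81.4 */
  /* get_minq_index() then returns the smallest q index i for which     */
  /* vp9_convert_qindex_to_q(i, bit_depth) >= 81.4, and that index is   */
  /* stored in the inter_minq table entry for this maximum Q.           */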
+
 // These functions use formulaic calculations to make playing with the
 // quantizer tables easier. If necessary they can be replaced by lookup
 // tables if and when things settle down in the experimental bitstream
-double vp9_convert_qindex_to_q(int qindex) {
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
   // Convert the index to a real Q value (scaled down to match old Q values)
-  return vp9_ac_quant(qindex, 0) / 4.0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+    case VPX_BITS_10:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 16.0;
+    case VPX_BITS_12:
+      return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1.0;
+  }
+#else
+  return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
 }
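The new divisors track the extra precision in the high-bit-depth quantizer
tables: the 10- and 12-bit AC dequant values are scaled up by roughly 4x and
16x relative to 8-bit, so dividing by 16.0 and 64.0 (rather than 4.0) keeps
the returned "real Q" on approximately the same scale for every bit depth. A
hedged sketch of the relationship:

  /* Approximate only; the low end of the lookup tables differs slightly. */
  /* real_q ~= vp9_ac_quant(qindex, 0, bd) / (4 << (bits - 8))            */
  /*   8 bits  -> divide by  4                                            */
  /*   10 bits -> divide by 16                                            */
  /*   12 bits -> divide by 64                                            */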
 
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
+                       double correction_factor,
+                       vpx_bit_depth_t bit_depth) {
+  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
+  int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
 
   // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
   return (int)(enumerator * correction_factor / q);
 }
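A quick check of the retuned constants (illustrative numbers, and assuming
BPER_MB_NORMBITS is 9 as elsewhere in the rate control code): for an inter
frame at real Q 40 with a correction factor of 1.0,

  /* enumerator  = 1800000                                               */
  /* enumerator += (1800000 * 40) >> 12   ->  1817578                    */
  /* bits per MB = 1817578 * 1.0 / 40     ->  ~45439 (scaled units)      */
  /* vp9_estimate_bits_at_q() shifts the per-frame total right by        */
  /* BPER_MB_NORMBITS, so this corresponds to roughly 89 real bits per   */
  /* macroblock at that Q.                                               */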
 
-static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
-                              double correction_factor) {
-  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor));
-  return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth) {
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
+                                           bit_depth));
+  return MAX(FRAME_OVERHEAD_BITS,
+             (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
 }
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const int min_frame_target = MAX(rc->min_frame_bandwidth,
                                    rc->avg_frame_bandwidth >> 5);
   if (target < min_frame_target)
@@ -129,6 +213,11 @@
   // Clip the frame target to the maximum allowed value.
   if (target > rc->max_frame_bandwidth)
     target = rc->max_frame_bandwidth;
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                         oxcf->rc_max_inter_bitrate_pct / 100;
+    target = MIN(target, max_rate);
+  }
   return target;
 }
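For example (illustrative numbers), with avg_frame_bandwidth at 40000 bits and
rc_max_inter_bitrate_pct set to 150:

  /* max_rate = 40000 * 150 / 100 = 60000 bits  */
  /* target   = MIN(target, 60000)              */

A value of 0 for rc_max_inter_bitrate_pct leaves inter frame targets uncapped.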
 
@@ -145,14 +234,16 @@
   return target;
 }
 
-
-// Update the buffer level for higher layers, given the encoded current layer.
+// Update the buffer level for higher temporal layers, given the encoded current
+// temporal layer.
 static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
-  int temporal_layer = 0;
+  int i = 0;
   int current_temporal_layer = svc->temporal_layer_id;
-  for (temporal_layer = current_temporal_layer + 1;
-      temporal_layer < svc->number_temporal_layers; ++temporal_layer) {
-    LAYER_CONTEXT *lc = &svc->layer_context[temporal_layer];
+  for (i = current_temporal_layer + 1;
+      i < svc->number_temporal_layers; ++i) {
+    const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                       svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
     RATE_CONTROL *lrc = &lc->rc;
     int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
         encoded_frame_size);
@@ -180,11 +271,36 @@
   rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = rc->bits_off_target;
 
-  if (cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) {
+  if (is_one_pass_cbr_svc(cpi)) {
     update_layer_buffer_level(&cpi->svc, encoded_frame_size);
   }
 }
 
+int vp9_rc_get_default_min_gf_interval(
+    int width, int height, double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  const double factor = width * height * framerate;
+  const int default_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+  if (factor <= factor_safe)
+    return default_interval;
+  else
+    return MAX(default_interval,
+               (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+  // Note this logic makes:
+  // 4K24: 5
+  // 4K30: 6
+  // 4K60: 12
+}
+
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  interval += (interval & 0x01);  // Round to even value
+  return MAX(interval, min_gf_interval);
+}
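A quick check of the note above, assuming MIN_GF_INTERVAL is 4 and
MAX_GF_INTERVAL is 16 as defined in vp9_ratectrl.h:

  /* 3840x2160 @ 24 fps: factor / factor_safe = 1.2 -> (int)(4 * 1.2 + 0.5) = 5 */
  /* 3840x2160 @ 30 fps: ratio 1.5 -> (int)(4 * 1.5 + 0.5) = 6                  */
  /* 3840x2160 @ 60 fps: ratio 3.0 -> (int)(4 * 3.0 + 0.5) = 12                 */
  /* max interval at 30 fps: MIN(16, (int)(30 * 0.75)) = 16, rounded to an      */
  /* even value and never allowed below the minimum interval.                   */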
+
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   int i;
 
@@ -193,13 +309,13 @@
     rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
   } else {
     rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
-                                           oxcf->best_allowed_q) / 2;
+                                       oxcf->best_allowed_q) / 2;
     rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
-                                           oxcf->best_allowed_q) / 2;
+                                         oxcf->best_allowed_q) / 2;
   }
 
   rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
-  rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
+  rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
 
   rc->buffer_level =    rc->starting_buffer_level;
   rc->bits_off_target = rc->starting_buffer_level;
@@ -213,7 +329,6 @@
   rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
 
-  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
   rc->next_key_frame_forced = 0;
@@ -221,17 +336,26 @@
   rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
-
   rc->ni_av_qi = oxcf->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
 
   rc->tot_q = 0.0;
-  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);
+  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
 
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
     rc->rate_correction_factors[i] = 1.0;
   }
+
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, oxcf->init_framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        oxcf->init_framerate, rc->min_gf_interval);
+  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
 }
 
 int vp9_rc_drop_frame(VP9_COMP *cpi) {
@@ -274,26 +398,34 @@
 
 static double get_rate_correction_factor(const VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  double rcf;
 
   if (cpi->common.frame_type == KEY_FRAME) {
-    return rc->rate_correction_factors[KF_STD];
+    rcf = rc->rate_correction_factors[KF_STD];
   } else if (cpi->oxcf.pass == 2) {
     RATE_FACTOR_LEVEL rf_lvl =
       cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
-    return rc->rate_correction_factors[rf_lvl];
+    rcf = rc->rate_correction_factors[rf_lvl];
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref &&
-        !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
-      return rc->rate_correction_factors[GF_ARF_STD];
+        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+        (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+      rcf = rc->rate_correction_factors[GF_ARF_STD];
     else
-      return rc->rate_correction_factors[INTER_NORMAL];
+      rcf = rc->rate_correction_factors[INTER_NORMAL];
   }
+  rcf *= rcf_mult[rc->frame_size_selector];
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
   RATE_CONTROL *const rc = &cpi->rc;
 
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
   if (cpi->common.frame_type == KEY_FRAME) {
     rc->rate_correction_factors[KF_STD] = factor;
   } else if (cpi->oxcf.pass == 2) {
@@ -302,15 +434,15 @@
     rc->rate_correction_factors[rf_lvl] = factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !rc->is_src_frame_alt_ref &&
-        !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
+        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+        (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
       rc->rate_correction_factors[GF_ARF_STD] = factor;
     else
       rc->rate_correction_factors[INTER_NORMAL] = factor;
   }
 }
 
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
   double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -323,40 +455,46 @@
     return;
 
   // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
-                                                 cm->base_qindex, cm->MBs,
-                                                 rate_correction_factor);
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+    projected_size_based_on_q =
+        vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+  } else {
+    projected_size_based_on_q = vp9_estimate_bits_at_q(cpi->common.frame_type,
+                                                       cm->base_qindex,
+                                                       cm->MBs,
+                                                       rate_correction_factor,
+                                                       cm->bit_depth);
+  }
   // Work out a size correction factor.
-  if (projected_size_based_on_q > 0)
-    correction_factor = (100 * cpi->rc.projected_frame_size) /
-                            projected_size_based_on_q;
+  if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+    correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+                        projected_size_based_on_q);
 
   // More heavily damped adjustment used if we have been oscillating either side
   // of target.
-  switch (damp_var) {
-    case 0:
-      adjustment_limit = 0.75;
-      break;
-    case 1:
-      adjustment_limit = 0.375;
-      break;
-    case 2:
-    default:
-      adjustment_limit = 0.25;
-      break;
-  }
+  adjustment_limit = 0.25 +
+      0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
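The fixed damp_var switch is replaced by a continuous limit,
0.25 + 0.5 * MIN(1, |log10(correction_factor / 100)|), so small errors get
small corrections and only order-of-magnitude misses use the old 0.75 cap.
Illustrative values:

  /* projected size off by +10% (correction_factor = 110): limit ~= 0.27 */
  /* off by +50% (150):                                    limit ~= 0.34 */
  /* off by 10x  (1000):                                   limit  = 0.75 */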
+
+  cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+  cpi->rc.q_1_frame = cm->base_qindex;
+  cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+  if (correction_factor > 110)
+    cpi->rc.rc_1_frame = -1;
+  else if (correction_factor < 90)
+    cpi->rc.rc_1_frame = 1;
+  else
+    cpi->rc.rc_1_frame = 0;
 
   if (correction_factor > 102) {
     // We are not already at the worst allowable quality
     correction_factor = (int)(100 + ((correction_factor - 100) *
                                   adjustment_limit));
     rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
-
     // Keep rate_correction_factor within limits
     if (rate_correction_factor > MAX_BPB_FACTOR)
       rate_correction_factor = MAX_BPB_FACTOR;
@@ -380,7 +518,7 @@
   const VP9_COMMON *const cm = &cpi->common;
   int q = active_worst_quality;
   int last_error = INT_MAX;
-  int i, target_bits_per_mb;
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
   const double correction_factor = get_rate_correction_factor(cpi);
 
   // Calculate required scaling factor based on target frame size and size of
@@ -391,8 +529,17 @@
   i = active_best_quality;
 
   do {
-    const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
-                                                             correction_factor);
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cm->seg.enabled &&
+        cpi->svc.temporal_layer_id == 0 &&
+        cpi->svc.spatial_layer_id == 0) {
+      bits_per_mb_at_this_q =
+          (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+    } else {
+      bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
+                                                      correction_factor,
+                                                      cm->bit_depth);
+    }
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -406,6 +553,14 @@
     }
   } while (++i <= active_worst_quality);
 
+  // In CBR mode, this makes sure q is between oscillating Qs to prevent
+  // resonance.
+  if (cpi->oxcf.rc_mode == VPX_CBR &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    q = clamp(q, MIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              MAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+  }
   return q;
 }
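The new CBR clamp only engages when the last two frames pulled the correction
in opposite directions (rc_1_frame * rc_2_frame == -1), i.e. one overshot and
the other undershot. A small illustrative case:

  /* q_2_frame = 40 (frame undershot, rc_2_frame = +1)             */
  /* q_1_frame = 56 (frame overshot,  rc_1_frame = -1)             */
  /* the regulated q is clamped to [40, 56] to stop the regulator  */
  /* bouncing between the two extremes.                            */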
 
@@ -424,12 +579,22 @@
   }
 }
 
-static int get_kf_active_quality(const RATE_CONTROL *const rc, int q) {
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *kf_low_motion_minq;
+  int *kf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
   return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q) {
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *arfgf_low_motion_minq;
+  int *arfgf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
   return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
@@ -465,18 +630,24 @@
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *rc = &cpi->rc;
   // Buffer level below which we push active_worst to worst_quality.
-  int64_t critical_level = rc->optimal_buffer_level >> 2;
+  int64_t critical_level = rc->optimal_buffer_level >> 3;
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
+  int ambient_qp;
+  unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
   if (cm->frame_type == KEY_FRAME)
     return rc->worst_quality;
-  if (cm->current_video_frame > 1)
-    active_worst_quality = MIN(rc->worst_quality,
-                               rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
-  else
-    active_worst_quality = MIN(rc->worst_quality,
-                               rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
+  // For ambient_qp we use the minimum of
+  // avg_frame_qindex[KEY_FRAME/INTER_FRAME] for the first few frames
+  // following a key frame. Both are initialized to worst_quality and updated
+  // with a (3/4, 1/4) average in postencode_update, so for the first few
+  // frames following a key frame the qp of that key frame is weighted into
+  // the active_worst_quality setting.
+  ambient_qp = (cm->current_video_frame < num_frames_weight_key) ?
+      MIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) :
+      rc->avg_frame_qindex[INTER_FRAME];
+  active_worst_quality = MIN(rc->worst_quality,
+                             ambient_qp * 5 / 4);
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
@@ -494,12 +665,11 @@
     if (critical_level) {
       buff_lvl_step = (rc->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
-        adjustment =
-            (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
-                  (rc->optimal_buffer_level - rc->buffer_level) /
-                  buff_lvl_step);
+        adjustment = (int)((rc->worst_quality - ambient_qp) *
+                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           buff_lvl_step);
       }
-      active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
+      active_worst_quality = ambient_qp + adjustment;
     }
   } else {
     // Set to worst_quality if buffer is below critical level.
@@ -516,17 +686,20 @@
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
   int q;
+  int *rtc_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
 
   if (frame_is_intra_only(cm)) {
     active_best_quality = rc->best_quality;
-    // Handle the special case for key frames forced when we have75 reached
+    // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            (last_boosted_q * 0.75));
+                                            (last_boosted_q * 0.75),
+                                            cm->bit_depth);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (cm->current_video_frame > 0) {
       // not first frame of one pass and kf_boost is set
@@ -534,7 +707,8 @@
       double q_val;
 
       active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]);
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -543,9 +717,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              !cpi->use_svc &&
@@ -559,7 +734,7 @@
     } else {
       q = active_worst_quality;
     }
-    active_best_quality = get_gf_active_quality(rc, q);
+    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -590,9 +765,10 @@
       !rc->this_key_frame_forced  &&
       !(cm->current_video_frame == 0)) {
     int qdelta = 0;
-    vp9_clear_system_state();
+    vpx_clear_system_state();
     qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                        active_worst_quality, 2.0);
+                                        active_worst_quality, 2.0,
+                                        cm->bit_depth);
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
@@ -622,7 +798,7 @@
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
                                const VP9EncoderConfig *const oxcf) {
-  static const double cq_adjust_threshold = 0.5;
+  static const double cq_adjust_threshold = 0.1;
   int active_cq_level = oxcf->cq_level;
   if (oxcf->rc_mode == VPX_CQ &&
       rc->total_target_bits > 0) {
@@ -644,18 +820,19 @@
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = rc->best_quality;
-
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            last_boosted_q * 0.75);
+                                            last_boosted_q * 0.75,
+                                            cm->bit_depth);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // not first frame of one pass and kf_boost is set
@@ -663,7 +840,8 @@
       double q_val;
 
       active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]);
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -672,9 +850,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -692,7 +871,7 @@
       if (q < cq_level)
         q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
@@ -701,10 +880,10 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
       }
     } else {
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
@@ -736,18 +915,20 @@
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   {
     int qdelta = 0;
-    vp9_clear_system_state();
+    vpx_clear_system_state();
 
     // Limit Q range for the adaptive loop.
     if (cm->frame_type == KEY_FRAME &&
         !rc->this_key_frame_forced &&
         !(cm->current_video_frame == 0)) {
       qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 2.0);
+                                          active_worst_quality, 2.0,
+                                          cm->bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
                (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 1.75);
+                                          active_worst_quality, 1.75,
+                                          cm->bit_depth);
     }
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
@@ -779,33 +960,71 @@
   return q;
 }
 
+int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
+  static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+    1.00,  // INTER_NORMAL
+    1.00,  // INTER_HIGH
+    1.50,  // GF_ARF_LOW
+    1.75,  // GF_ARF_STD
+    2.00,  // KF_STD
+  };
+  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+      {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+  const VP9_COMMON *const cm = &cpi->common;
+  int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level],
+                                          q, rate_factor_deltas[rf_level],
+                                          cm->bit_depth);
+  return qdelta;
+}
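The per-level rate factors express how many times the baseline bits-per-MB
each frame type is allowed; this helper converts that ratio into a q offset,
so boosted frame types are pushed toward lower q (higher quality). A minimal
usage sketch:

  /* Cap the top of the q range for an ARF/GF standard frame: */
  int qdelta = vp9_frame_type_qdelta(cpi, GF_ARF_STD, active_worst_quality);
  /* qdelta is <= 0 here (the target rate is 1.75x the baseline), so    */
  /* active_worst_quality + qdelta becomes the boosted, lower-q cap.    */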
+
+#define STATIC_MOTION_THRESH 95
 static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
                                          int *bottom_index,
                                          int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
   const int cq_level = get_active_cq_level(rc, oxcf);
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
-    // Handle the special case for key frames forced when we have75 reached
+    // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
-      int qindex = rc->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                            last_boosted_q * 0.75);
-      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+      double last_boosted_q;
+      int delta_qindex;
+      int qindex;
+
+      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+        qindex = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        active_best_quality = qindex;
+        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                              last_boosted_q * 1.25,
+                                              cm->bit_depth);
+        active_worst_quality = MIN(qindex + delta_qindex, active_worst_quality);
+
+      } else {
+        qindex = rc->last_boosted_qindex;
+        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                              last_boosted_q * 0.75,
+                                              cm->bit_depth);
+        active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+      }
     } else {
       // Not forced keyframe.
       double q_adj_factor = 1.0;
       double q_val;
       // Baseline value derived from cpi->active_worst_quality and kf boost.
-      active_best_quality = get_kf_active_quality(rc, active_worst_quality);
+      active_best_quality = get_kf_active_quality(rc, active_worst_quality,
+                                                  cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -817,9 +1036,10 @@
 
       // Convert the adjustment factor to a qindex delta
       // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp9_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor);
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -837,7 +1057,7 @@
       if (q < cq_level)
         q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
@@ -846,10 +1066,15 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        active_best_quality = get_gf_active_quality(rc, q);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+        // Modify best quality for second level arfs. For mode VPX_Q this
+        // becomes the baseline frame q.
+        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
       }
     } else {
-      active_best_quality = get_gf_active_quality(rc, q);
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
@@ -866,51 +1091,75 @@
     }
   }
 
-  // Clip the active best and worst quality values to limits.
+  // Extension to max or min Q if undershoot or overshoot is outside
+  // the permitted range.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+    if (frame_is_intra_only(cm) ||
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+    } else {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+      active_worst_quality += cpi->twopass.extend_maxq;
+    }
+  }
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  vpx_clear_system_state();
+  // Q restrictions for static forced key frames are dealt with elsewhere.
+  if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
+      !rc->this_key_frame_forced ||
+      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+                                       active_worst_quality);
+    active_worst_quality = MAX(active_worst_quality + qdelta,
+                               active_best_quality);
+  }
+#endif
+
+  // Modify active_best_quality for downscaled normal frames.
+  if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+    int qdelta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
+                                            active_best_quality, 2.0,
+                                            cm->bit_depth);
+    active_best_quality = MAX(active_best_quality + qdelta, rc->best_quality);
+  }
+
   active_best_quality = clamp(active_best_quality,
                               rc->best_quality, rc->worst_quality);
   active_worst_quality = clamp(active_worst_quality,
                                active_best_quality, rc->worst_quality);
 
-  *top_index = active_worst_quality;
-  *bottom_index = active_best_quality;
-
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
-  vp9_clear_system_state();
-  {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
-      1.00,  // INTER_NORMAL
-      1.00,  // INTER_HIGH
-      1.50,  // GF_ARF_LOW
-      1.75,  // GF_ARF_STD
-      2.00,  // KF_STD
-    };
-    const double rate_factor =
-      rate_factor_deltas[gf_group->rf_level[gf_group->index]];
-    int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                            active_worst_quality, rate_factor);
-    *top_index = active_worst_quality + qdelta;
-    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
-  }
-#endif
-
   if (oxcf->rc_mode == VPX_Q) {
     q = active_best_quality;
   // Special case code to try and match quality with forced key frames.
-  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
-    q = rc->last_boosted_qindex;
+  } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) &&
+             rc->this_key_frame_forced) {
+    // If static since last kf use better of last boosted and last kf q.
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = MIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    } else {
+      q = rc->last_boosted_qindex;
+    }
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                           active_best_quality, active_worst_quality);
-    if (q > *top_index) {
+    if (q > active_worst_quality) {
       // Special case when we are targeting the max allowed rate.
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
-        *top_index = q;
+        active_worst_quality = q;
       else
-        q = *top_index;
+        q = active_worst_quality;
     }
   }
+  q = clamp(q, active_best_quality, active_worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
 
   assert(*top_index <= rc->worst_quality &&
          *top_index >= rc->best_quality);
@@ -966,6 +1215,12 @@
 
   rc->this_frame_target = target;
 
+  // Modify frame size target when down-scaling.
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+      rc->frame_size_selector != UNSCALED)
+    rc->this_frame_target = (int)(rc->this_frame_target
+        * rate_thresh_mult[rc->frame_size_selector]);
+
   // Target rate per SB64 (including partial SB64s).
   rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
                              (cm->width * cm->height);
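For example (illustrative numbers), a 30000-bit frame target at 1280x720
works out to:

  /* sb64_target_rate = 30000 * 64 * 64 / (1280 * 720)                */
  /*                  = 122880000 / 921600 ~= 133 bits per 64x64 SB   */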
@@ -991,11 +1246,9 @@
     // this frame refreshes means next frames don't unless specified by user
     rc->frames_since_golden = 0;
 
-    if (cpi->oxcf.pass == 2) {
-      if (!rc->source_alt_ref_pending &&
-          cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD)
-      rc->source_alt_ref_active = 0;
-    } else if (!rc->source_alt_ref_pending) {
+    // If we are not using an alt ref in the upcoming group, clear the arf
+    // active flag.
+    if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
 
@@ -1018,19 +1271,33 @@
   RATE_CONTROL *const rc = &cpi->rc;
   const int qindex = cm->base_qindex;
 
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    vp9_cyclic_refresh_postencode(cpi);
+  }
+
   // Update rate control heuristics
   rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  vp9_rc_update_rate_correction_factors(
-      cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF ||
-            oxcf->rc_mode == VPX_CBR) ? 2 : 0);
+  vp9_rc_update_rate_correction_factors(cpi);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
     rc->last_q[KEY_FRAME] = qindex;
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+    if (cpi->use_svc) {
+      int i = 0;
+      SVC *svc = &cpi->svc;
+      for (i = 0; i < svc->number_temporal_layers; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        lrc->last_q[KEY_FRAME] = rc->last_q[KEY_FRAME];
+        lrc->avg_frame_qindex[KEY_FRAME] = rc->avg_frame_qindex[KEY_FRAME];
+      }
+    }
   } else {
     if (rc->is_src_frame_alt_ref ||
         !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
@@ -1039,7 +1306,7 @@
       rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
       rc->ni_frames++;
-      rc->tot_q += vp9_convert_qindex_to_q(qindex);
+      rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       rc->avg_q = rc->tot_q / rc->ni_frames;
       // Calculate the average Q for normal inter frames (not key or GFU
       // frames).
@@ -1054,11 +1321,14 @@
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
   if ((qindex < rc->last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+      (cm->frame_type == KEY_FRAME) ||
+      (!rc->constrained_gf_group &&
+       (cpi->refresh_alt_ref_frame ||
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
+  if (cm->frame_type == KEY_FRAME)
+    rc->last_kf_qindex = qindex;
 
   update_buffer_level(cpi, rc->projected_frame_size);
 
@@ -1081,13 +1351,15 @@
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
-      (cm->frame_type != KEY_FRAME))
-    // Update the alternate reference frame stats as appropriate.
-    update_alt_ref_frame_stats(cpi);
-  else
-    // Update the Golden frame stats as appropriate.
-    update_golden_frame_stats(cpi);
+  if (!cpi->use_svc) {
+    if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+        (cm->frame_type != KEY_FRAME))
+      // Update the alternate reference frame stats as appropriate.
+      update_alt_ref_frame_stats(cpi);
+    else
+      // Update the Golden frame stats as appropriate.
+      update_golden_frame_stats(cpi);
+  }
 
   if (cm->frame_type == KEY_FRAME)
     rc->frames_since_key = 0;
@@ -1095,14 +1367,22 @@
     rc->frames_since_key++;
     rc->frames_to_key--;
   }
+
+  // Trigger the resizing of the next frame if it is scaled.
+  if (oxcf->pass != 0) {
+    cpi->resize_pending =
+        rc->next_frame_size_selector != rc->frame_size_selector;
+    rc->frame_size_selector = rc->next_frame_size_selector;
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
   // Update buffer level with zero size, update frame counters, and return.
   update_buffer_level(cpi, 0);
-  cpi->common.last_frame_type = cpi->common.frame_type;
   cpi->rc.frames_since_key++;
   cpi->rc.frames_to_key--;
+  cpi->rc.rc_2_frame = 0;
+  cpi->rc.rc_1_frame = 0;
 }
 
 // Use this macro to turn on/off use of alt-refs in one-pass mode.
@@ -1152,11 +1432,15 @@
     cm->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
-    rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
       rc->frames_till_gf_update_due = rc->frames_to_key;
+      rc->constrained_gf_group = 1;
+    } else {
+      rc->constrained_gf_group = 0;
+    }
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
     rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -1175,14 +1459,26 @@
   const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
-  int target = rc->avg_frame_bandwidth;
-  if (svc->number_temporal_layers > 1 &&
-      oxcf->rc_mode == VPX_CBR) {
+  int target;
+
+  if (oxcf->gf_cbr_boost_pct) {
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    target =  cpi->refresh_golden_frame ?
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+      (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) :
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+      (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+  } else {
+    target = rc->avg_frame_bandwidth;
+  }
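A quick check of the CBR golden-frame boost split (illustrative numbers): with
gf_cbr_boost_pct = 25 (so af_ratio_pct = 125), a baseline GF interval of 8 and
an average per-frame budget B:

  /* golden frame target: B * 8 * 125 / (8 * 100 + 25) ~= 1.21 * B      */
  /* other frames:        B * 8 * 100 / (8 * 100 + 25) ~= 0.97 * B      */
  /* one boosted + seven normal frames ~= 8 * B, so the average rate    */
  /* over the GF interval is preserved.                                 */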
+  if (is_one_pass_cbr_svc(cpi)) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
-    int current_temporal_layer = svc->temporal_layer_id;
-    const LAYER_CONTEXT *lc = &svc->layer_context[current_temporal_layer];
+    int layer =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id,
+            svc->temporal_layer_id, svc->number_temporal_layers);
+    const LAYER_CONTEXT *lc = &svc->layer_context[layer];
     target = lc->avg_frame_size;
     min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
   }
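The layer_context array is now indexed by both the spatial and the temporal
id. A minimal sketch of the mapping, assuming LAYER_IDS_TO_IDX lays layers out
spatial-major as in vp9_svc_layercontext.h:

  /* idx = spatial_layer_id * number_temporal_layers + temporal_layer_id */
  /* e.g. 2 spatial x 3 temporal layers:                                 */
  /*   (sl 0, tl 0..2) -> 0, 1, 2                                        */
  /*   (sl 1, tl 0..2) -> 3, 4, 5                                        */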
@@ -1195,6 +1491,11 @@
     const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                         oxcf->rc_max_inter_bitrate_pct / 100;
+    target = MIN(target, max_rate);
+  }
   return MAX(min_frame_target, target);
 }
 
@@ -1208,11 +1509,13 @@
       ? INT_MAX : (int)(rc->starting_buffer_level / 2);
   } else {
     int kf_boost = 32;
-    double framerate = oxcf->framerate;
+    double framerate = cpi->framerate;
     if (svc->number_temporal_layers > 1 &&
         oxcf->rc_mode == VPX_CBR) {
       // Use the layer framerate for temporal layers CBR mode.
-      const LAYER_CONTEXT *lc = &svc->layer_context[svc->temporal_layer_id];
+      const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id,
+          svc->temporal_layer_id, svc->number_temporal_layers);
+      const LAYER_CONTEXT *lc = &svc->layer_context[layer];
       framerate = lc->framerate;
     }
     kf_boost = MAX(kf_boost, (int)(2 * framerate - 16));
@@ -1225,10 +1528,27 @@
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
+// Reset information needed to set proper reference frames and buffer updates
+// for temporal layering. This is called when a key frame is encoded.
+static void reset_temporal_layer_to_zero(VP9_COMP *cpi) {
+  int sl;
+  LAYER_CONTEXT *lc = NULL;
+  cpi->svc.temporal_layer_id = 0;
+
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
+    lc->current_video_frame_in_layer = 0;
+    lc->frames_from_key_frame = 0;
+  }
+}
+
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target = rc->avg_frame_bandwidth;
+  const int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
+      cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
+
   if ((cm->current_video_frame == 0) ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
       (cpi->oxcf.auto_key && (rc->frames_since_key %
@@ -1236,34 +1556,49 @@
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
 
-    if (is_spatial_svc(cpi)) {
-      cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
+    if (is_two_pass_svc(cpi)) {
+      cpi->svc.layer_context[layer].is_key_frame = 1;
       cpi->ref_frame_flags &=
           (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
-    }
-
-    if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    } else if (is_one_pass_cbr_svc(cpi)) {
+      cpi->svc.layer_context[layer].is_key_frame = 1;
+      reset_temporal_layer_to_zero(cpi);
+      cpi->ref_frame_flags &=
+                (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+      // Assumption here is that LAST_FRAME is being updated for a keyframe.
+      // Thus no change in update flags.
       target = calc_iframe_target_size_one_pass_cbr(cpi);
     }
   } else {
     cm->frame_type = INTER_FRAME;
-
-    if (is_spatial_svc(cpi)) {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+    if (is_two_pass_svc(cpi)) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
       if (cpi->svc.spatial_layer_id == 0) {
         lc->is_key_frame = 0;
       } else {
-        lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+        lc->is_key_frame =
+            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
         if (lc->is_key_frame)
           cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
       }
       cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
-    }
-
-    if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    } else if (is_one_pass_cbr_svc(cpi)) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+      if (cpi->svc.spatial_layer_id == 0) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame =
+            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
+      }
       target = calc_pframe_target_size_one_pass_cbr(cpi);
     }
   }
+
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
   vp9_rc_set_frame_target(cpi, target);
   rc->frames_till_gf_update_due = INT_MAX;
   rc->baseline_gf_interval = INT_MAX;
@@ -1284,18 +1619,42 @@
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
-    target = calc_iframe_target_size_one_pass_cbr(cpi);
   } else {
     cm->frame_type = INTER_FRAME;
-    target = calc_pframe_target_size_one_pass_cbr(cpi);
   }
+  if (rc->frames_till_gf_update_due == 0) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_golden_update(cpi);
+    else
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  else
+    target = calc_pframe_target_size_one_pass_cbr(cpi);
+
   vp9_rc_set_frame_target(cpi, target);
-  // Don't use gf_update by default in CBR mode.
-  rc->frames_till_gf_update_due = INT_MAX;
-  rc->baseline_gf_interval = INT_MAX;
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+    cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
+  else
+    cpi->resize_pending = 0;
 }
 
-int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) {
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth) {
   int start_index = rc->worst_quality;
   int target_index = rc->worst_quality;
   int i;
@@ -1303,14 +1662,14 @@
   // Convert the average q value to an index.
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= qstart)
       break;
   }
 
   // Convert the q target to an index
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= qtarget)
       break;
   }
 
@@ -1318,36 +1677,45 @@
 }
 
 int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
-                               int qindex, double rate_target_ratio) {
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth) {
   int target_index = rc->worst_quality;
   int i;
 
   // Look up the current projected bits per block for the base index
-  const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0);
+  const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0,
+                                                  bit_depth);
 
   // Find the target bits per mb based on the base value and given ratio.
   const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
   // Convert the q target to an index
   for (i = rc->best_quality; i < rc->worst_quality; ++i) {
-    target_index = i;
-    if (vp9_rc_bits_per_mb(frame_type, i, 1.0) <= target_bits_per_mb )
+    if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+        target_bits_per_mb) {
+      target_index = i;
       break;
+    }
   }
-
   return target_index - qindex;
 }
 
-void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
-                                RATE_CONTROL *const rc) {
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
   // Set Maximum gf/arf interval
-  rc->max_gf_interval = 16;
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, cpi->framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
 
   // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
-  if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2))
-    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
 
   if (is_altref_enabled(cpi)) {
     if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
@@ -1356,6 +1724,9 @@
 
   if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
     rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+  // Clamp min to max
+  rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
 }
 
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1364,7 +1735,7 @@
   RATE_CONTROL *const rc = &cpi->rc;
   int vbr_max_bits;
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate);
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
   rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
                                 oxcf->two_pass_vbrmin_section / 100);
 
@@ -1382,5 +1753,233 @@
   rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
                                     vbr_max_bits);
 
-  vp9_rc_set_gf_max_interval(cpi, rc);
+  vp9_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+  int max_delta;
+  double position_factor = 1.0;
+
+  // How far through the clip are we.
+  // This number is used to damp the per frame rate correction.
+  // Range 0 - 1.0
+  if (cpi->twopass.total_stats.count) {
+    position_factor = sqrt((double)cpi->common.current_video_frame /
+                           cpi->twopass.total_stats.count);
+  }
+  max_delta = (int)(position_factor *
+                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+  // vbr_bits_off_target > 0 means we have extra bits to spend
+  if (vbr_bits_off_target > 0) {
+    *this_frame_target +=
+      (vbr_bits_off_target > max_delta) ? max_delta
+                                        : (int)vbr_bits_off_target;
+  } else {
+    *this_frame_target -=
+      (vbr_bits_off_target < -max_delta) ? max_delta
+                                         : (int)-vbr_bits_off_target;
+  }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+      rc->vbr_bits_off_target_fast) {
+    int one_frame_bits = MAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+    fast_extra_bits =
+      (int)MIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)MIN(fast_extra_bits,
+      MAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    *this_frame_target += (int)fast_extra_bits;
+    rc->vbr_bits_off_target_fast -= fast_extra_bits;
+  }
+}
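// --- Illustrative sketch (editorial note, not part of the patch) ---
// A minimal standalone model of the clamping in vbr_rate_correction() above,
// using a hypothetical helper name: the per-frame adjustment is capped at
// VBR_PCT_ADJUSTMENT_LIMIT (50%) of the frame target, damped by how far
// through the clip we are.
static int vbr_adjusted_target_sketch(int frame_target, long long bits_off,
                                      double position_factor) {
  const int max_delta = (int)(position_factor * (frame_target * 50 / 100));
  if (bits_off > 0)
    frame_target += (bits_off > max_delta) ? max_delta : (int)bits_off;
  else
    frame_target -= (-bits_off > max_delta) ? max_delta : (int)-bits_off;
  return frame_target;
}
// e.g. vbr_adjusted_target_sketch(40000, 120000, 0.5) == 50000: only
// max_delta = 10000 of the 120000-bit surplus is spent on this frame.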
+
+void vp9_set_target_rate(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target_rate = rc->base_frame_target;
+
+  // Correction to rate target based on prior over or under shoot.
+  if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+    vbr_rate_correction(cpi, &target_rate);
+  vp9_rc_set_frame_target(cpi, target_rate);
+}
+
+// Check if we should resize, based on average QP from past x frames.
+// For now only allow at most one scale-down step; the scaling factor is 2.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int resize_now = 0;
+  cpi->resize_scale_num = 1;
+  cpi->resize_scale_den = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = (int)(5 * cpi->framerate);
+    cpi->resize_avg_qp += cm->base_qindex;
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count >= window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if the buffer level has underflowed a sufficient amount
+      // in the past window, and we are at the original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state.
+      if (cpi->resize_state == 0 &&
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+        resize_now = 1;
+        cpi->resize_state = 1;
+      } else if (cpi->resize_state == 1 &&
+                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
+        resize_now = -1;
+        cpi->resize_state = 0;
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+  if (resize_now != 0) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;
+    // For now, resize is by 1/2 x 1/2.
+    cpi->resize_scale_num = 1;
+    cpi->resize_scale_den = 2;
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Reset cyclic refresh parameters.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+      vp9_cyclic_refresh_reset_resize(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
+    target_bits_per_frame = (resize_now == 1) ?
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp9_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resize is down, check if projected q index is close to worst_quality,
+    // and if so, reduce the rate correction factor (since likely can afford
+    // lower q for resized frame).
+    if (resize_now == 1 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resize is back up, check if projected q index is too much above the
+    // current base_qindex, and if so, reduce the rate correction factor
+    // (since prefer to keep q for resized frame at least close to previous q).
+    if (resize_now == -1 &&
+       qindex > 130 * cm->base_qindex / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+    }
+  }
+  return resize_now;
+}
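// --- Illustrative sketch (editorial note, not part of the patch) ---
// The two thresholds used by the window check in vp9_resize_one_pass_cbr()
// above, reduced to a hypothetical standalone helper: scale down when more
// than a quarter of the frames in the window underflowed the buffer, scale
// back up when the average QP drops below 40% of worst_quality.
static int resize_decision_sketch(int resize_state, int underflow_count,
                                  int frame_count, int avg_qp,
                                  int worst_quality) {
  if (resize_state == 0 && underflow_count > (frame_count >> 2))
    return 1;   // scale down to 1/2 x 1/2
  if (resize_state == 1 && avg_qp < 40 * worst_quality / 100)
    return -1;  // scale back up to the original resolution
  return 0;     // keep the current size
}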
+
+// Compute average source sad (temporal sad: between current source and
+// previous source) over a subset of superblocks. Use this to detect big changes
+// in content and allow rate control to react.
+// TODO(marpan): Superblock sad is computed again in variance partition for
+// non-rd mode (but based on last reconstructed frame). Should try to reuse
+// these computations.
+void vp9_avg_source_sad(VP9_COMP *cpi) {
+  VP9_COMMON * const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->high_source_sad = 0;
+  if (cpi->Last_Source != NULL) {
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+    const int last_src_ystride = cpi->Last_Source->y_stride;
+    int sbi_row, sbi_col;
+    const BLOCK_SIZE bsize = BLOCK_64X64;
+    // Loop over sub-sample of frame, and compute average sad over 64x64 blocks.
+    uint64_t avg_sad = 0;
+    int num_samples = 0;
+    int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    for (sbi_row = 0; sbi_row < sb_rows; sbi_row ++) {
+      for (sbi_col = 0; sbi_col < sb_cols; sbi_col ++) {
+        // Checker-board pattern, ignore boundary.
+        if ((sbi_row > 0 && sbi_col > 0) &&
+            (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+            ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+            (sbi_row % 2 != 0 && sbi_col % 2 != 0))) {
+          num_samples++;
+          avg_sad += cpi->fn_ptr[bsize].sdf(src_y,
+                                            src_ystride,
+                                            last_src_y,
+                                            last_src_ystride);
+        }
+        src_y += 64;
+        last_src_y += 64;
+      }
+      src_y += (src_ystride << 6) - (sb_cols << 6);
+      last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+    }
+    if (num_samples > 0)
+      avg_sad = avg_sad / num_samples;
+    // Set the high_source_sad flag if we detect a very large increase in
+    // avg_sad between the current and the previous frame value(s). Use a
+    // minimum threshold for cases where there is only a small change from
+    // content that is completely static.
+    if (avg_sad > MAX(4000, (rc->avg_source_sad << 3)) &&
+        rc->frames_since_key > 1)
+      rc->high_source_sad = 1;
+    else
+      rc->high_source_sad = 0;
+    rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+  }
+}
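// --- Illustrative sketch (editorial note, not part of the patch) ---
// The sampling rule used by vp9_avg_source_sad() above, in isolation:
// interior superblocks whose row and column indices have the same parity
// form the checker-board subset that contributes to the average SAD.
static int sb_is_sampled_sketch(int sbi_row, int sbi_col,
                                int sb_rows, int sb_cols) {
  const int interior = sbi_row > 0 && sbi_col > 0 &&
                       sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1;
  const int same_parity = (sbi_row % 2) == (sbi_col % 2);
  return interior && same_parity;
}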
+
+// Test if encoded frame will significantly overshoot the target bitrate, and
+// if so, set the QP, reset/adjust some rate control parameters, and return 1.
+int vp9_encodedframe_overshoot(VP9_COMP *cpi,
+                               int frame_size,
+                               int *q) {
+  VP9_COMMON * const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int thresh_qp = 3 * (rc->worst_quality >> 2);
+  int thresh_rate = rc->avg_frame_bandwidth * 10;
+  if (cm->base_qindex < thresh_qp &&
+      frame_size > thresh_rate) {
+    // Force a re-encode, and for now use max-QP.
+    *q = cpi->rc.worst_quality;
+    // Adjust avg_frame_qindex and buffer_level, as these parameters will affect
+    // QP selection for subsequent frames. If they have settled down to a very
+    // different (low QP) state, then not re-adjusting them may cause the next
+    // frame to select a low QP and overshoot again.
+    // TODO(marpan): Check if rate correction factor should also be adjusted.
+    cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    return 1;
+  } else {
+    return 0;
+  }
 }
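// --- Illustrative sketch (editorial note, not part of the patch) ---
// A hypothetical standalone restatement of the overshoot test above: it
// fires only when the frame was coded well below the worst allowed QP yet
// still produced roughly 10x the average frame budget.
static int overshoot_triggers_sketch(int base_qindex, int frame_size,
                                     int worst_quality, int avg_bandwidth) {
  const int thresh_qp = 3 * (worst_quality >> 2);
  const int thresh_rate = avg_bandwidth * 10;
  return base_qindex < thresh_qp && frame_size > thresh_rate;
}
// e.g. with worst_quality = 63 and avg_bandwidth = 20000 bits, a frame coded
// at base_qindex 40 that produced 250000 bits (thresh_qp = 45, thresh_rate =
// 200000) triggers the max-QP re-encode and the buffer reset.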
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 456daf4..11dfa35 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -12,6 +12,7 @@
 #ifndef VP9_ENCODER_VP9_RATECTRL_H_
 #define VP9_ENCODER_VP9_RATECTRL_H_
 
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
 #include "vp9/common/vp9_blockd.h"
@@ -23,6 +24,9 @@
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
+#define MIN_GF_INTERVAL     4
+#define MAX_GF_INTERVAL     16
+
 typedef enum {
   INTER_NORMAL = 0,
   INTER_HIGH = 1,
@@ -32,6 +36,27 @@
   RATE_FACTOR_LEVELS = 5
 } RATE_FACTOR_LEVEL;
 
+// Internal frame scaling level.
+typedef enum {
+  UNSCALED = 0,     // Frame is unscaled.
+  SCALE_STEP1 = 1,  // First-level down-scaling.
+  FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = {16, 24};
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
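// --- Illustrative sketch (editorial note, not part of the patch) ---
// How the 1/16th factors above map to frame dimensions, assuming a
// hypothetical 1280x720 native size: SCALE_STEP1 (factor 24) gives
// 1280 * 16 / 24 = 853 and 720 * 16 / 24 = 480, i.e. 2/3 of native size,
// while UNSCALED (factor 16) leaves the dimensions unchanged.
static int scaled_dim_sketch(int native_dim, int factor_16ths) {
  return native_dim * 16 / factor_16ths;
}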
 typedef struct {
   // Rate targetting variables
   int base_frame_target;           // A baseline frame target before adjustment
@@ -41,6 +66,7 @@
   int sb64_target_rate;
   int last_q[FRAME_TYPES];         // Separate values for Intra/Inter
   int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+  int last_kf_qindex;              // Q index of the last key frame coded.
 
   int gfu_boost;
   int last_boost;
@@ -50,9 +76,11 @@
 
   int frames_since_golden;
   int frames_till_gf_update_due;
+  int min_gf_interval;
   int max_gf_interval;
   int static_scene_max_gf_interval;
   int baseline_gf_interval;
+  int constrained_gf_group;
   int frames_to_key;
   int frames_since_key;
   int this_key_frame_forced;
@@ -75,6 +103,7 @@
   int64_t buffer_level;
   int64_t bits_off_target;
   int64_t vbr_bits_off_target;
+  int64_t vbr_bits_off_target_fast;
 
   int decimation_factor;
   int decimation_count;
@@ -85,6 +114,8 @@
   int long_rolling_target_bits;
   int long_rolling_actual_bits;
 
+  int rate_error_estimate;
+
   int64_t total_actual_bits;
   int64_t total_target_bits;
   int64_t total_target_vs_actual;
@@ -95,7 +126,25 @@
   int64_t starting_buffer_level;
   int64_t optimal_buffer_level;
   int64_t maximum_buffer_size;
-  // int active_best_quality;
+
+  // Rate control history for the last frame (1) and the frame before (2).
+  // -1: undershot
+  //  1: overshoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  // Auto frame-scaling variables.
+  FRAME_SCALE_LEVEL frame_size_selector;
+  FRAME_SCALE_LEVEL next_frame_size_selector;
+  int frame_width[FRAME_SCALE_STEPS];
+  int frame_height[FRAME_SCALE_STEPS];
+  int rf_level_maxq[RATE_FACTOR_LEVELS];
+
+  uint64_t avg_source_sad;
+  int high_source_sad;
 } RATE_CONTROL;
 
 struct VP9_COMP;
@@ -104,9 +153,19 @@
 void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
                  RATE_CONTROL *rc);
 
-double vp9_convert_qindex_to_q(int qindex);
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth);
 
-void vp9_rc_init_minq_luts();
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
+
+void vp9_rc_init_minq_luts(void);
+
+int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -144,7 +203,7 @@
 
 // Updates rate correction factors
 // Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
 
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
@@ -167,7 +226,7 @@
 
 // Estimates bits per mb for a given qindex and correction factor.
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                       double correction_factor);
+                       double correction_factor, vpx_bit_depth_t bit_depth);
 
 // Clamping utilities for bitrate targets for iframes and pframes.
 int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi,
@@ -180,17 +239,29 @@
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a target q value
-int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget);
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth);
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a value that should equate to the given rate ratio.
 int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
-                               int qindex, double rate_target_ratio);
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth);
+
+int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q);
 
 void vp9_rc_update_framerate(struct VP9_COMP *cpi);
 
-void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi,
-                                RATE_CONTROL *const rc);
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void vp9_set_target_rate(struct VP9_COMP *cpi);
+
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
+void vp9_avg_source_sad(struct VP9_COMP *cpi);
+
+int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c
index 4fc3e9e..2f2f7c1 100644
--- a/libvpx/vp9/encoder/vp9_rd.c
+++ b/libvpx/vp9/encoder/vp9_rd.c
@@ -15,6 +15,9 @@
 #include "./vp9_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -25,7 +28,6 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -36,7 +38,6 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
 
 #define RD_THRESH_POW      1.25
 #define RD_MULT_EPB_RATIO  64
@@ -44,6 +45,18 @@
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+void vp9_rd_cost_reset(RD_COST *rd_cost) {
+  rd_cost->rate = INT_MAX;
+  rd_cost->dist = INT64_MAX;
+  rd_cost->rdcost = INT64_MAX;
+}
+
+void vp9_rd_cost_init(RD_COST *rd_cost) {
+  rd_cost->rate = 0;
+  rd_cost->dist = 0;
+  rd_cost->rdcost = 0;
+}
+
 // The baseline rd thresholds for breaking out of the rd loop for
 // certain modes are assumed to be based on 8x8 blocks.
 // This table is used to correct for block size.
@@ -53,7 +66,7 @@
 };
 
 static void fill_mode_costs(VP9_COMP *cpi) {
-  const FRAME_CONTEXT *const fc = &cpi->common.fc;
+  const FRAME_CONTEXT *const fc = cpi->common.fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; ++i)
@@ -81,7 +94,7 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-            vp9_prob probs[ENTROPY_NODES];
+            vpx_prob probs[ENTROPY_NODES];
             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp9_coef_tree);
@@ -93,34 +106,69 @@
 }
 
 // Values are now correlated to quantizer.
-static int sad_per_bit16lut[QINDEX_RANGE];
-static int sad_per_bit4lut[QINDEX_RANGE];
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
 
-void vp9_init_me_luts() {
+#if CONFIG_VP9_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+                            vpx_bit_depth_t bit_depth) {
   int i;
-
   // Initialize the sad lut tables using a formulaic calculation for now.
   // This is to make it easier to resolve the impact of experimental changes
   // to the quantizer tables.
-  for (i = 0; i < QINDEX_RANGE; ++i) {
-    const double q = vp9_convert_qindex_to_q(i);
-    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
-    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
+  for (i = 0; i < range; i++) {
+    const double q = vp9_convert_qindex_to_q(i, bit_depth);
+    bit16lut[i] = (int)(0.0418 * q + 2.4107);
+    bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
+void vp9_init_me_luts(void) {
+  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+                  VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+                  VPX_BITS_10);
+  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+                  VPX_BITS_12);
+#endif
+}
+
 static const int rd_boost_factor[16] = {
   64, 32, 32, 32, 24, 16, 12, 12,
   8, 8, 4, 4, 2, 2, 1, 0
 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-128, 144, 128, 128, 144
+  128, 144, 128, 128, 144
 };
 
 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
-  const int q = vp9_dc_quant(qindex, 0);
-  int rdmult = 88 * q * q / 24;
-
+  const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t rdmult = 0;
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      rdmult = 88 * q * q / 24;
+      break;
+    case VPX_BITS_10:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4);
+      break;
+    case VPX_BITS_12:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8);
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  int64_t rdmult = 88 * q * q / 24;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
@@ -129,18 +177,59 @@
     rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
     rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
   }
-  return rdmult;
+  if (rdmult < 1)
+    rdmult = 1;
+  return (int)rdmult;
 }
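// --- Illustrative sketch (editorial note, not part of the patch) ---
// Why the high-bit-depth branches above shift by 4 and 8: the DC quantizer
// returned by vp9_dc_quant() grows roughly 4x per extra 2 bits of depth, so
// q * q grows by ~16x at 10 bits and ~256x at 12 bits, and rounding right by
// 4 and 8 bits brings rdmult back onto the 8-bit scale. A hedged restatement
// with a hypothetical helper:
static long long rdmult_normalized_sketch(long long q, int bit_depth) {
  const long long raw = 88 * q * q / 24;
  const int shift = bit_depth == 10 ? 4 : bit_depth == 12 ? 8 : 0;
  return shift ? (raw + (1LL << (shift - 1))) >> shift : raw;
}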
 
-static int compute_rd_thresh_factor(int qindex) {
+static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
+  double q;
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+      break;
+    case VPX_BITS_10:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0;
+      break;
+    case VPX_BITS_12:
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(debargha): Adjust the function below.
-  const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
-  return MAX(q, 8);
+  return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
-void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
-  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
-  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
+void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      x->sadperbit16 = sad_per_bit16lut_8[qindex];
+      x->sadperbit4 = sad_per_bit4lut_8[qindex];
+      break;
+    case VPX_BITS_10:
+      x->sadperbit16 = sad_per_bit16lut_10[qindex];
+      x->sadperbit4 = sad_per_bit4lut_10[qindex];
+      break;
+    case VPX_BITS_12:
+      x->sadperbit16 = sad_per_bit16lut_12[qindex];
+      x->sadperbit4 = sad_per_bit4lut_12[qindex];
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  }
+#else
+  (void)cpi;
+  x->sadperbit16 = sad_per_bit16lut_8[qindex];
+  x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
@@ -149,9 +238,8 @@
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
     const int qindex =
         clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
-                  cm->y_dc_delta_q,
-              0, MAXQ);
-    const int q = compute_rd_thresh_factor(qindex);
+              cm->y_dc_delta_q, 0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
       // Threshold here seems unnecessarily harsh but fine given actual
@@ -178,11 +266,12 @@
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   RD_OPT *const rd = &cpi->rd;
   int i;
 
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
   rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
@@ -194,12 +283,15 @@
                        cm->frame_type != KEY_FRAME) ? 0 : 1;
 
   set_block_thresholds(cm, rd);
+  set_partition_probs(cm, xd);
 
-  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
-    fill_token_costs(x->token_costs, cm->fc.coef_probs);
+  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+    fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
+  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+      cm->frame_type == KEY_FRAME) {
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
+      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
                       vp9_partition_tree);
   }
 
@@ -211,11 +303,11 @@
       vp9_build_nmv_cost_table(x->nmvjointcost,
                                cm->allow_high_precision_mv ? x->nmvcost_hp
                                                            : x->nmvcost,
-                               &cm->fc.nmvc, cm->allow_high_precision_mv);
+                               &cm->fc->nmvc, cm->allow_high_precision_mv);
 
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
-                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+                        cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
     }
   }
 }
@@ -295,7 +387,7 @@
   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -311,10 +403,10 @@
     int d_q10, r_q10;
     static const uint32_t MAX_XSQ_Q10 = 245727;
     const uint64_t xsq_q10_64 =
-        ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
-    *rate = (n * r_q10 + 2) >> 2;
+    *rate = ((r_q10 << n_log2) + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
   }
 }
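// --- Editorial worked example (not part of the patch) ---
// The n -> n_log2 change above turns the multiply by the pixel count into a
// shift. For a hypothetical 8x8 block (n_log2 = 6), qstep = 16 and
// var = 4096:
//   xsq_q10 = ((16 * 16 << (6 + 10)) + 4096 / 2) / 4096 = 4096,
// well under MAX_XSQ_Q10, and the table rate is scaled back up by the same
// shift: rate = ((r_q10 << 6) + 2) >> 2.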
@@ -332,8 +424,8 @@
   int i;
   switch (tx_size) {
     case TX_4X4:
-      vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
-      vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
       break;
     case TX_8X8:
       for (i = 0; i < num_4x4_w; i += 2)
@@ -362,49 +454,47 @@
 void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride,
                  int ref_frame, BLOCK_SIZE block_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int_mv this_mv;
   int i;
   int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
   int max_mv = 0;
-
+  int near_same_nearest;
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
-  int row_offset, col_offset;
-  int num_mv_refs = MAX_MV_REF_CANDIDATES +
+  const int num_mv_refs = MAX_MV_REF_CANDIDATES +
                     (cpi->sf.adaptive_motion_search &&
-                     cpi->common.show_frame &&
-                     block_size < cpi->sf.max_partition_size);
+                     block_size < x->max_partition_size);
 
   MV pred_mv[3];
-  pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv;
-  pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv;
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
   pred_mv[2] = x->pred_mv[ref_frame];
+  assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
 
+  near_same_nearest =
+      x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+          x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
   // Get the sad for each candidate reference mv.
   for (i = 0; i < num_mv_refs; ++i) {
-    this_mv.as_mv = pred_mv[i];
+    const MV *this_mv = &pred_mv[i];
+    int fp_row, fp_col;
 
-    max_mv = MAX(max_mv,
-                 MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
-    // Only need to check zero mv once.
-    if (!this_mv.as_int && zero_seen)
+    if (i == 1 && near_same_nearest)
       continue;
+    fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+    fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+    max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
 
-    zero_seen = zero_seen || !this_mv.as_int;
+    if (fp_row == 0 && fp_col == 0 && zero_seen)
+      continue;
+    zero_seen |= (fp_row == 0 && fp_col == 0);
 
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
-
+    ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                            ref_y_ptr, ref_y_stride);
-
     // Note if it is the best so far.
     if (this_sad < best_sad) {
       best_sad = this_sad;
@@ -439,16 +529,31 @@
   }
 }
 
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
-                                                   int ref_frame) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
-  return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride) {
+  const int bw = b_width_log2_lookup[plane_bsize];
+  const int y = 4 * (raster_block >> bw);
+  const int x = 4 * (raster_block & ((1 << bw) - 1));
+  return y * stride + x;
 }
 
-int vp9_get_switchable_rate(const VP9_COMP *cpi) {
-  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base) {
+  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
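// --- Editorial worked example (not part of the patch) ---
// The raster offset mapping in vp9_raster_block_offset() above, for a
// hypothetical 16x16 plane block (b_width_log2 = 2, i.e. four 4x4 units per
// row): raster_block 5 sits one 4x4 row down and one column across, so
// y = 4 * (5 >> 2) = 4, x = 4 * (5 & 3) = 4, and the returned offset is
// 4 * stride + 4.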
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+                                             int ref_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  return
+      (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ?
+          &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
+}
+
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *
@@ -462,11 +567,17 @@
 
   // Set baseline threshold values.
   for (i = 0; i < MAX_MODES; ++i)
-    rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
-  rd->thresh_mult[THR_NEARESTMV] = 0;
-  rd->thresh_mult[THR_NEARESTG] = 0;
-  rd->thresh_mult[THR_NEARESTA] = 0;
+  if (sf->adaptive_rd_thresh) {
+    rd->thresh_mult[THR_NEARESTMV] = 300;
+    rd->thresh_mult[THR_NEARESTG] = 300;
+    rd->thresh_mult[THR_NEARESTA] = 300;
+  } else {
+    rd->thresh_mult[THR_NEARESTMV] = 0;
+    rd->thresh_mult[THR_NEARESTG] = 0;
+    rd->thresh_mult[THR_NEARESTA] = 0;
+  }
 
   rd->thresh_mult[THR_DC] += 1000;
 
@@ -474,10 +585,6 @@
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
-  // Adjust threshold only in real time mode, which only uses last
-  // reference frame.
-  rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
-
   rd->thresh_mult[THR_NEARMV] += 1000;
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
@@ -505,74 +612,56 @@
   rd->thresh_mult[THR_D153_PRED] += 2500;
   rd->thresh_mult[THR_D207_PRED] += 2500;
   rd->thresh_mult[THR_D63_PRED] += 2500;
-
-  // Disable frame modes if flags not set.
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    rd->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    rd->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    rd->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    rd->thresh_mult[THR_NEARMV   ] = INT_MAX;
-  }
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    rd->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    rd->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    rd->thresh_mult[THR_NEARG    ] = INT_MAX;
-    rd->thresh_mult[THR_NEWG     ] = INT_MAX;
-  }
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    rd->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    rd->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    rd->thresh_mult[THR_NEARA    ] = INT_MAX;
-    rd->thresh_mult[THR_NEWA     ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    rd->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-  }
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    rd->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    rd->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-  }
 }
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
+  static const int thresh_mult[2][MAX_REFS] =
+      {{2500, 2500, 2500, 4500, 4500, 2500},
+       {2000, 2000, 2000, 4000, 4000, 2000}};
   RD_OPT *const rd = &cpi->rd;
-  int i;
-
-  for (i = 0; i < MAX_REFS; ++i)
-    rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode)  ? -500 : 0;
-
-  rd->thresh_mult_sub8x8[THR_LAST] += 2500;
-  rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
-  rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
-  rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
-  rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
-  rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
-
-  // Check for masked out split cases.
-  for (i = 0; i < MAX_REFS; ++i)
-    if (sf->disable_split_mask & (1 << i))
-      rd->thresh_mult_sub8x8[i] = INT_MAX;
-
-  // Disable mode test if frame flag is not set.
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
-    rd->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
-    rd->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
-    rd->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_ALT_FLAG))
-    rd->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_GOLD_FLAG | VP9_ALT_FLAG))
-    rd->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
+  const int idx = cpi->oxcf.mode == BEST;
+  memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
 }
+
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index) {
+  if (rd_thresh > 0) {
+    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+    int mode;
+    for (mode = 0; mode < top_mode; ++mode) {
+      const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+      const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+      BLOCK_SIZE bs;
+      for (bs = min_size; bs <= max_size; ++bs) {
+        int *const fact = &factor_buf[bs][mode];
+        if (mode == best_mode_index) {
+          *fact -= (*fact >> 4);
+        } else {
+          *fact = MIN(*fact + RD_THRESH_INC,
+                      rd_thresh * RD_THRESH_MAX_FACT);
+        }
+      }
+    }
+  }
+}
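// --- Illustrative sketch (editorial note, not part of the patch) ---
// Effect of vp9_update_rd_thresh_fact() above on a single entry, via a
// hypothetical helper: the winning mode's factor decays geometrically
// (e.g. 32 -> 30 -> 29 -> ...) while every other mode creeps up by
// RD_THRESH_INC per call until it saturates at rd_thresh * RD_THRESH_MAX_FACT.
static int thresh_fact_step_sketch(int fact, int is_best_mode, int rd_thresh) {
  const int cap = rd_thresh * 64;  // RD_THRESH_MAX_FACT
  if (is_best_mode)
    return fact - (fact >> 4);
  return fact + 1 < cap ? fact + 1 : cap;  // RD_THRESH_INC == 1
}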
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return 20 * q;
+    case VPX_BITS_10:
+      return 5 * q;
+    case VPX_BITS_12:
+      return ROUND_POWER_OF_TWO(5 * q, 2);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  return 20 * q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
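// --- Editorial worked example (not part of the patch) ---
// The intra cost penalty above stays roughly constant across bit depths
// because q itself grows with depth: with a hypothetical 8-bit q of 40 the
// penalty is 20 * 40 = 800; the equivalent 10-bit q is about 4x larger (160)
// and 5 * 160 = 800; at 12 bits q is about 16x larger (640) and
// ROUND_POWER_OF_TWO(5 * 640, 2) = 800.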
diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h
index eeb5e0f..28385c9 100644
--- a/libvpx/vp9/encoder/vp9_rd.h
+++ b/libvpx/vp9/encoder/vp9_rd.h
@@ -36,6 +36,9 @@
 #define MAX_MODES 30
 #define MAX_REFS  6
 
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
@@ -51,6 +54,12 @@
 
   THR_NEARMV,
   THR_NEARA,
+  THR_NEARG,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+
   THR_COMP_NEARESTLA,
   THR_COMP_NEARESTGA,
 
@@ -58,13 +67,9 @@
 
   THR_COMP_NEARLA,
   THR_COMP_NEWLA,
-  THR_NEARG,
   THR_COMP_NEARGA,
   THR_COMP_NEWGA,
 
-  THR_ZEROMV,
-  THR_ZEROG,
-  THR_ZEROA,
   THR_COMP_ZEROLA,
   THR_COMP_ZEROGA,
 
@@ -96,24 +101,28 @@
   int thresh_mult_sub8x8[MAX_REFS];
 
   int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
-  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
 
-  int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
-  int64_t tx_select_diff[TX_MODES];
-  // TODO(agrange): can this overflow?
-  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
 
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t mask_filter;
 
   int RDMULT;
   int RDDIV;
 } RD_OPT;
 
+typedef struct RD_COST {
+  int rate;
+  int64_t dist;
+  int64_t rdcost;
+} RD_COST;
+
+// Reset the rate distortion cost values to maximum (invalid) value.
+void vp9_rd_cost_reset(RD_COST *rd_cost);
+// Initialize the rate distortion cost values to zero.
+void vp9_rd_cost_init(RD_COST *rd_cost);
+
 struct TileInfo;
+struct TileDataEnc;
 struct VP9_COMP;
 struct macroblock;
 
@@ -121,18 +130,25 @@
 
 void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
 
-void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
 
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist);
 
-int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+                            const MACROBLOCKD *const xd);
 
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
-                                                   int ref_frame);
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride);
 
-void vp9_init_me_luts();
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base);
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+                                             int ref_frame);
+
+void vp9_init_me_luts(void);
 
 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
@@ -143,6 +159,9 @@
 
 void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
+void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index);
+
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
     return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
@@ -158,6 +177,10 @@
                           int mi_row, int mi_col,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index e368037..96c6474 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -12,8 +12,11 @@
 #include <math.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -24,8 +27,8 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -36,16 +39,19 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC      1
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << INTRA_FRAME))
 
-#define LAST_FRAME_MODE_MASK    0xFFEDCD60
-#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
-#define ALT_REF_MODE_MASK       0xFFC648D0
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
 
 #define MIN_EARLY_TERM_INDEX    3
+#define NEW_MV_DISCOUNT_FACTOR  8
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -60,19 +66,18 @@
   MACROBLOCK *x;
   ENTROPY_CONTEXT t_above[16];
   ENTROPY_CONTEXT t_left[16];
-  int rate;
-  int64_t dist;
-  int64_t sse;
   int this_rate;
   int64_t this_dist;
   int64_t this_sse;
   int64_t this_rd;
   int64_t best_rd;
-  int skip;
+  int exit_early;
   int use_fast_coef_costing;
   const scan_order *so;
+  uint8_t skippable;
 };
 
+#define LAST_NEW_MV_INDEX 6
 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {NEARESTMV, {LAST_FRAME,   NONE}},
   {NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -86,6 +91,12 @@
 
   {NEARMV,    {LAST_FRAME,   NONE}},
   {NEARMV,    {ALTREF_FRAME, NONE}},
+  {NEARMV,    {GOLDEN_FRAME, NONE}},
+
+  {ZEROMV,    {LAST_FRAME,   NONE}},
+  {ZEROMV,    {GOLDEN_FRAME, NONE}},
+  {ZEROMV,    {ALTREF_FRAME, NONE}},
+
   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
 
@@ -93,13 +104,9 @@
 
   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
-  {NEARMV,    {GOLDEN_FRAME, NONE}},
   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
 
-  {ZEROMV,    {LAST_FRAME,   NONE}},
-  {ZEROMV,    {GOLDEN_FRAME, NONE}},
-  {ZEROMV,    {ALTREF_FRAME, NONE}},
   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 
@@ -122,19 +129,6 @@
   {{INTRA_FRAME,  NONE}},
 };
 
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
-                               int raster_block, int stride) {
-  const int bw = b_width_log2(plane_bsize);
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
-                                          int raster_block, int16_t *base) {
-  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                            int m, int n, int min_plane, int max_plane) {
   int i;
@@ -162,7 +156,8 @@
 
 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
-                            int *out_rate_sum, int64_t *out_dist_sum) {
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -171,36 +166,85 @@
   int64_t dist_sum = 0;
   const int ref = xd->mi[0]->mbmi.ref_frame[0];
   unsigned int sse;
-  const int shift = 8;
+  unsigned int var = 0;
+  unsigned int sum_sse = 0;
+  int64_t total_sse = 0;
+  int skip_flag = 1;
+  const int shift = 6;
+  int rate;
+  int64_t dist;
+  const int dequant_shift =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+          xd->bd - 5 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          3;
+
+  x->pred_sse[ref] = 0;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    const int64_t dc_thr = p->quant_thred[0] >> shift;
+    const int64_t ac_thr = p->quant_thred[1] >> shift;
+    // The low thresholds are used to measure if the prediction errors are
+    // low enough so that we can skip the mode search.
+    const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
+    const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
+    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int idx, idy;
+    int lw = b_width_log2_lookup[unit_size] + 2;
+    int lh = b_height_log2_lookup[unit_size] + 2;
 
-    const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                                                pd->dst.buf, pd->dst.stride,
-                                                &sse);
+    sum_sse = 0;
 
-    if (!x->select_tx_size) {
-      if (sse < p->quant_thred[0] >> shift)
-        x->skip_txfm[i] = 1;
-      else if (var < p->quant_thred[1] >> shift)
-        x->skip_txfm[i] = 2;
-      else
-        x->skip_txfm[i] = 0;
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+        int block_idx = (idy << 1) + idx;
+        int low_err_skip = 0;
+
+        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+                                        dst, pd->dst.stride, &sse);
+        x->bsse[(i << 2) + block_idx] = sse;
+        sum_sse += sse;
+
+        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
+        if (!x->select_tx_size) {
+          // Check if all ac coefficients can be quantized to zero.
+          if (var < ac_thr || var == 0) {
+            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
+
+            // Check if dc coefficient can be quantized to zero.
+            if (sse - var < dc_thr || sse == var) {
+              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
+
+              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
+                low_err_skip = 1;
+            }
+          }
+        }
+
+        if (skip_flag && !low_err_skip)
+          skip_flag = 0;
+
+        if (i == 0)
+          x->pred_sse[ref] += sse;
+      }
     }
 
-    x->bsse[i] = sse;
-    if (i == 0)
-      x->pred_sse[ref] = sse;
+    total_sse += sum_sse;
 
     // Fast approximate the modelling function.
-    if (cpi->oxcf.speed > 4) {
+    if (cpi->sf.simple_model_rd_from_var) {
       int64_t rate;
-      int64_t dist;
-      int64_t square_error = sse;
-      int quantizer = (pd->dequant[1] >> 3);
+      const int64_t square_error = sum_sse;
+      int quantizer = (pd->dequant[1] >> dequant_shift);
 
       if (quantizer < 120)
         rate = (square_error * (280 - quantizer)) >> 8;
@@ -210,20 +254,21 @@
       rate_sum += rate;
       dist_sum += dist;
     } else {
-      int rate;
-      int64_t dist;
-      vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
-                                   pd->dequant[1] >> 3, &rate, &dist);
+      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
+                                   pd->dequant[1] >> dequant_shift,
+                                   &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
     }
   }
 
+  *skip_txfm_sb = skip_flag;
+  *skip_sse_sb = total_sse << 4;
   *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum << 4;
 }
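// --- Illustrative sketch (editorial note, not part of the patch) ---
// The per-unit classification in model_rd_for_sb() above, in isolation: AC
// coefficients are assumed quantizable to zero when the prediction variance
// is below the AC threshold, and the DC coefficient as well when the
// remaining DC energy (sse - var) is below the DC threshold.
static int skip_txfm_class_sketch(unsigned int sse, unsigned int var,
                                  long long dc_thr, long long ac_thr) {
  if (var < ac_thr || var == 0) {
    if (sse - var < dc_thr || sse == var)
      return 2;  // corresponds to SKIP_TXFM_AC_DC
    return 1;    // corresponds to SKIP_TXFM_AC_ONLY
  }
  return 0;      // corresponds to SKIP_TXFM_NONE
}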
 
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
@@ -238,6 +283,43 @@
   return error;
 }
 
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+  }
+
+  return error;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);
+  int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
@@ -249,12 +331,12 @@
   { 1, 2, 3, 4, 11,  256 - 21, 0 },
   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
-static INLINE int cost_coeffs(MACROBLOCK *x,
-                              int plane, int block,
-                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
-                              TX_SIZE tx_size,
-                              const int16_t *scan, const int16_t *nb,
-                              int use_fast_coef_costing) {
+static int cost_coeffs(MACROBLOCK *x,
+                       int plane, int block,
+                       ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+                       TX_SIZE tx_size,
+                       const int16_t *scan, const int16_t *nb,
+                       int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
@@ -262,12 +344,18 @@
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
-  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
   uint8_t token_cache[32 * 32];
   int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi, pd) == tx_size);
@@ -281,23 +369,29 @@
 
     // dc token
     int v = qcoeff[0];
-    int prev_t = vp9_dct_value_tokens_ptr[v].token;
-    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+    int16_t prev_t;
+    EXTRABIT e;
+    vp9_get_token_extra(v, &prev_t, &e);
+    cost = (*token_costs)[0][pt][prev_t] +
+        vp9_get_cost(prev_t, e, cat6_high_cost);
+
     token_cache[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
 
     // ac tokens
     for (c = 1; c < eob; c++) {
       const int rc = scan[c];
-      int t;
+      int16_t t;
 
       v = qcoeff[rc];
-      t = vp9_dct_value_tokens_ptr[v].token;
+      vp9_get_token_extra(v, &t, &e);
       if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+        cost += (*token_costs)[!prev_t][!prev_t][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
       } else {
         pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+        cost += (*token_costs)[!prev_t][pt][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
         token_cache[rc] = vp9_pt_energy_class[t];
       }
       prev_t = t;
@@ -323,39 +417,50 @@
 
   return cost;
 }
-static void dist_block(int plane, int block, TX_SIZE tx_size,
-                       struct rdcost_block_args* args) {
+
+static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+                       int64_t *out_dist, int64_t *out_sse) {
   const int ss_txfrm_size = tx_size << 1;
-  MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   int64_t this_sse;
   int shift = tx_size == TX_32X32 ? 0 : 2;
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                               &this_sse) >> shift;
-  args->sse  = this_sse >> shift;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+  *out_dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                     &this_sse, bd) >> shift;
+#else
+  *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                              &this_sse) >> shift;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  *out_sse = this_sse >> shift;
 
   if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
-                    (1 << ss_txfrm_size)) >> (shift + 2);
-    args->dist += (p >> 4);
-    args->sse  += p;
+                    (1 << ss_txfrm_size)) >>
+#if CONFIG_VP9_HIGHBITDEPTH
+                        (shift + 2 + (bd - 8) * 2);
+#else
+                        (shift + 2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    *out_dist += (p >> 4);
+    *out_sse  += p;
   }
 }
 
-static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                       TX_SIZE tx_size, struct rdcost_block_args* args) {
+static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+                      TX_SIZE tx_size, struct rdcost_block_args* args) {
   int x_idx, y_idx;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 
-  args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
-                           args->t_left + y_idx, tx_size,
-                           args->so->scan, args->so->neighbors,
-                           args->use_fast_coef_costing);
+  return cost_coeffs(args->x, plane, block, args->t_above + x_idx,
+                     args->t_left + y_idx, tx_size,
+                     args->so->scan, args->so->neighbors,
+                     args->use_fast_coef_costing);
 }
 
 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -365,39 +470,65 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int64_t rd1, rd2, rd;
+  int rate;
+  int64_t dist;
+  int64_t sse;
 
-  if (args->skip)
+  if (args->exit_early)
     return;
 
   if (!is_inter_block(mbmi)) {
-    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
-    dist_block(plane, block, tx_size, args);
-  } else {
-    if (x->skip_txfm[plane] == 0) {
+    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+        SKIP_TXFM_NONE) {
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-      dist_block(plane, block, tx_size, args);
-    } else if (x->skip_txfm[plane] == 2) {
+      dist_block(x, plane, block, tx_size, &dist, &sse);
+    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+               SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
-      int16_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
-      int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
       vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
-      args->sse  = x->bsse[plane] << 4;
-      args->dist = args->sse;
-      if (!x->plane[plane].eobs[block])
-        args->dist = args->sse - ((coeff[0] * coeff[0] -
-            (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2);
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
+      if (x->plane[plane].eobs[block]) {
+        const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+        const int64_t resd_sse = coeff[0] - dqcoeff[0];
+        int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+        dc_correct >>= ((xd->bd - 8) * 2);
+#endif
+        if (tx_size != TX_32X32)
+          dc_correct >>= 2;
+
+        dist = MAX(0, sse - dc_correct);
+      }
     } else {
+      // SKIP_TXFM_AC_DC
       // skip forward transform
       x->plane[plane].eobs[block] = 0;
-      args->sse  = x->bsse[plane] << 4;
-      args->dist = args->sse;
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
     }
+  } else {
+    // full forward transform and quantization
+    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
   }
 
-  rate_block(plane, block, plane_bsize, tx_size, args);
-  rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
-  rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
+  rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
+  if (args->this_rd + rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  rate = rate_block(plane, block, plane_bsize, tx_size, args);
+  rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
+  rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
@@ -405,15 +536,17 @@
     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
                                     (rd1 > rd2 && !xd->lossless);
 
-  args->this_rate += args->rate;
-  args->this_dist += args->dist;
-  args->this_sse  += args->sse;
+  args->this_rate += rate;
+  args->this_dist += dist;
+  args->this_sse += sse;
   args->this_rd += rd;
 
   if (args->this_rd > args->best_rd) {
-    args->skip = 1;
+    args->exit_early = 1;
     return;
   }
+
+  args->skippable &= !x->plane[plane].eobs[block];
 }
 
 static void txfm_rd_in_plane(MACROBLOCK *x,
@@ -429,6 +562,7 @@
   args.x = x;
   args.best_rd = ref_best_rd;
   args.use_fast_coef_costing = use_fast_coef_casting;
+  args.skippable = 1;
 
   if (plane == 0)
     xd->mi[0]->mbmi.tx_size = tx_size;
@@ -439,7 +573,7 @@
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
                                          block_rd_txfm, &args);
-  if (args.skip) {
+  if (args.exit_early) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
     *sse        = INT64_MAX;
@@ -448,7 +582,7 @@
     *distortion = args.this_dist;
     *rate       = args.this_rate;
     *sse        = args.this_sse;
-    *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
+    *skippable  = args.skippable;
   }
 }
 
@@ -468,7 +602,6 @@
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-  cpi->tx_stepdown_count[0]++;
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -476,128 +609,108 @@
                                    int64_t *distortion,
                                    int *skip,
                                    int64_t *psse,
-                                   int64_t tx_cache[TX_MODES],
                                    int64_t ref_best_rd,
                                    BLOCK_SIZE bs) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX}};
-  TX_SIZE n, m;
+  int n, m;
   int s0, s1;
-  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   int64_t best_rd = INT64_MAX;
-  TX_SIZE best_tx = TX_4X4;
+  TX_SIZE best_tx = max_tx_size;
+  int start_tx, end_tx;
 
-  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
-  for (n = TX_4X4; n <= max_tx_size; n++) {
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    start_tx = max_tx_size;
+    end_tx = 0;
+  } else {
+    TX_SIZE chosen_tx_size = MIN(max_tx_size,
+                                 tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    start_tx = chosen_tx_size;
+    end_tx = chosen_tx_size;
+  }
+
+  for (n = start_tx; n >= end_tx; n--) {
+    int r_tx_size = 0;
+    for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
+      if (m == n)
+        r_tx_size += vp9_cost_zero(tx_probs[m]);
+      else
+        r_tx_size += vp9_cost_one(tx_probs[m]);
+    }
     txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
                      &sse[n], ref_best_rd, 0, bs, n,
                      cpi->sf.use_fast_coef_costing);
     r[n][1] = r[n][0];
     if (r[n][0] < INT_MAX) {
-      for (m = 0; m <= n - (n == max_tx_size); m++) {
-        if (m == n)
-          r[n][1] += vp9_cost_zero(tx_probs[m]);
-        else
-          r[n][1] += vp9_cost_one(tx_probs[m]);
-      }
+      r[n][1] += r_tx_size;
     }
-    if (d[n] == INT64_MAX) {
+    if (d[n] == INT64_MAX || r[n][0] == INT_MAX) {
       rd[n][0] = rd[n][1] = INT64_MAX;
     } else if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      if (is_inter_block(mbmi)) {
+        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+      } else {
+        rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
+      }
     } else {
       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd[n][1] == INT64_MAX ||
+        (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
+        s[n] == 1))
+      break;
+
     if (rd[n][1] < best_rd) {
       best_tx = n;
       best_rd = rd[n][1];
     }
   }
-  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
-                      best_tx : MIN(max_tx_size, max_mode_tx_size);
-
+  mbmi->tx_size = best_tx;
 
   *distortion = d[mbmi->tx_size];
   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
   *skip       = s[mbmi->tx_size];
   *psse       = sse[mbmi->tx_size];
-
-  tx_cache[ONLY_4X4] = rd[TX_4X4][0];
-  tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
-  tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
-  tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
-
-  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
-    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-    cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
-    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
-  } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
-    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
-  } else {
-    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
-  }
 }
 
-static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                  int64_t *distortion, int *skip,
-                                  int64_t *psse, BLOCK_SIZE bs,
-                                  int64_t txfm_cache[TX_MODES],
-                                  int64_t ref_best_rd) {
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  assert(bs == xd->mi[0]->mbmi.sb_type);
-
-  vp9_subtract_plane(x, bs, 0);
-
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
-    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd,
-                           bs);
-  } else {
-    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse,
-                           txfm_cache, ref_best_rd, bs);
-  }
-}
-
-static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                  int64_t *distortion, int *skip,
-                                  BLOCK_SIZE bs,
-                                  int64_t txfm_cache[TX_MODES],
-                                  int64_t ref_best_rd) {
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int64_t *distortion, int *skip,
+                            int64_t *psse, BLOCK_SIZE bs,
+                            int64_t ref_best_rd) {
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t sse;
+  int64_t *ret_sse = psse ? psse : &sse;
 
   assert(bs == xd->mi[0]->mbmi.sb_type);
-  if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
-    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd,
+
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
                            bs);
   } else {
-    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse,
-                           txfm_cache, ref_best_rd, bs);
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
+                           ref_best_rd, bs);
   }
 }
 
-
 static int conditional_skipintra(PREDICTION_MODE mode,
                                  PREDICTION_MODE best_intra_mode) {
   if (mode == D117_PRED &&
@@ -619,7 +732,8 @@
   return 0;
 }
 
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int row, int col,
                                      PREDICTION_MODE *best_mode,
                                      const int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
@@ -629,29 +743,132 @@
   PREDICTION_MODE mode;
   MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
-                                                            src_stride)];
-  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
-                                                       dst_stride)];
+  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+  uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
-
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
   uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t best_dst16[8 * 8];
+#endif
 
-  assert(ib < 4);
-
-  vpx_memcpy(ta, a, sizeof(ta));
-  vpx_memcpy(tl, l, sizeof(tl));
+  memcpy(ta, a, sizeof(ta));
+  memcpy(tl, l, sizeof(tl));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+      int64_t this_rd;
+      int ratey = 0;
+      int64_t distortion = 0;
+      int rate = bmode_costs[mode];
+
+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+        continue;
+
+      // Only do the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(mode, *best_mode))
+            continue;
+      }
+
+      memcpy(tempa, ta, sizeof(ta));
+      memcpy(templ, tl, sizeof(tl));
+
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+          const int block = (row + idy) * 2 + (col + idx);
+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+                                                                  block,
+                                                                  p->src_diff);
+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          xd->mi[0]->bmi[block].as_mode = mode;
+          vp9_predict_intra_block(xd, 1, TX_4X4, mode,
+                                  x->skip_encode ? src : dst,
+                                  x->skip_encode ? src_stride : dst_stride,
+                                  dst, dst_stride,
+                                  col + idx, row + idy, 0);
+          vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+                                    dst, dst_stride, xd->bd);
+          if (xd->lossless) {
+            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            vp9_highbd_fwht4x4(src_diff, coeff, 8);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+                                   dst, dst_stride,
+                                   p->eobs[block], xd->bd);
+          } else {
+            int64_t unused;
+            const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            if (tx_type == DCT_DCT)
+              vpx_highbd_fdct4x4(src_diff, coeff, 8);
+            else
+              vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            distortion += vp9_highbd_block_error(
+                coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                16, &unused, xd->bd) >> 2;
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                                  dst, dst_stride, p->eobs[block], xd->bd);
+          }
+        }
+      }
+
+      rate += ratey;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        memcpy(a, tempa, sizeof(tempa));
+        memcpy(l, templ, sizeof(templ));
+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+          memcpy(best_dst16 + idy * 8,
+                 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+        }
+      }
+    next_highbd:
+      {}
+    }
+    if (best_rd >= rd_thresh || x->skip_encode)
+      return best_rd;
+
+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+      memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+             best_dst16 + idy * 8,
+             num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+    }
+
+    return best_rd;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
@@ -668,24 +885,23 @@
           continue;
     }
 
-    vpx_memcpy(tempa, ta, sizeof(ta));
-    vpx_memcpy(templ, tl, sizeof(tl));
+    memcpy(tempa, ta, sizeof(ta));
+    memcpy(templ, tl, sizeof(tl));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        const int block = ib + idy * 2 + idx;
+        const int block = (row + idy) * 2 + (col + idx);
         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
-        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
-                                                            p->src_diff);
-        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        int16_t *const src_diff =
+            vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0]->bmi[block].as_mode = mode;
-        vp9_predict_intra_block(xd, block, 1,
-                                TX_4X4, mode,
+        vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
-                                dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+                                dst, dst_stride, col + idx, row + idy, 0);
+        vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -726,11 +942,11 @@
       *bestdistortion = distortion;
       best_rd = this_rd;
       *best_mode = mode;
-      vpx_memcpy(a, tempa, sizeof(tempa));
-      vpx_memcpy(l, templ, sizeof(templ));
+      memcpy(a, tempa, sizeof(tempa));
+      memcpy(l, templ, sizeof(templ));
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
-        vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
-                   num_4x4_blocks_wide * 4);
+        memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+               num_4x4_blocks_wide * 4);
     }
   next:
     {}
@@ -740,8 +956,8 @@
     return best_rd;
 
   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
-    vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
-               num_4x4_blocks_wide * 4);
+    memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+           num_4x4_blocks_wide * 4);
 
   return best_rd;
 }
@@ -753,8 +969,8 @@
   int i, j;
   const MACROBLOCKD *const xd = &mb->e_mbd;
   MODE_INFO *const mic = xd->mi[0];
-  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
-  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -766,8 +982,8 @@
   ENTROPY_CONTEXT t_above[4], t_left[4];
   const int *bmode_costs = cpi->mbmode_cost;
 
-  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
+  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
@@ -783,9 +999,9 @@
         bmode_costs  = cpi->y_mode_costs[A][L];
       }
 
-      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                      t_above + idx, t_left + idy, &r, &ry, &d,
-                                      bsize, best_rd - total_rd);
+      this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
+                                      bmode_costs, t_above + idx, t_left + idy,
+                                      &r, &ry, &d, bsize, best_rd - total_rd);
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
 
@@ -813,11 +1029,11 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 }
 
+// This function is used only for intra_only frames
 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
                                       BLOCK_SIZE bsize,
-                                      int64_t tx_cache[TX_MODES],
                                       int64_t best_rd) {
   PREDICTION_MODE mode;
   PREDICTION_MODE mode_selected = DC_PRED;
@@ -826,29 +1042,29 @@
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
-  int i;
-  int *bmode_costs = cpi->mbmode_cost;
+  int *bmode_costs;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
+  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+  bmode_costs = cpi->y_mode_costs[A][L];
 
-  if (cpi->sf.tx_size_search_method == USE_FULL_RD)
-    for (i = 0; i < TX_MODES; i++)
-      tx_cache[i] = INT64_MAX;
-
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_tx_cache[TX_MODES];
-    MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
-    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
-
-    if (cpi->common.frame_type == KEY_FRAME) {
-      const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
-      const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
-
-      bmode_costs = cpi->y_mode_costs[A][L];
+    if (cpi->sf.use_nonrd_pick_mode) {
+      // These speed features are turned on in hybrid non-RD and RD mode
+      // for key frame coding in the context of real-time setting.
+      if (conditional_skipintra(mode, mode_selected))
+          continue;
+      if (*skippable)
+        break;
     }
+
     mic->mbmi.mode = mode;
 
-    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, bsize, local_tx_cache, best_rd);
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+        &s, NULL, bsize, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -865,16 +1081,6 @@
       *distortion     = this_distortion;
       *skippable      = s;
     }
-
-    if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
-      for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
-        const int64_t adj_rd = this_rd + local_tx_cache[i] -
-            local_tx_cache[cpi->common.tx_mode];
-        if (adj_rd < tx_cache[i]) {
-          tx_cache[i] = adj_rd;
-        }
-      }
-    }
   }
 
   mic->mbmi.mode = mode_selected;
@@ -883,21 +1089,24 @@
   return best_rd;
 }
 
-static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
-                             int *rate, int64_t *distortion, int *skippable,
-                             int64_t *sse, BLOCK_SIZE bsize,
-                             int64_t ref_best_rd) {
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
   int plane;
   int pnrate = 0, pnskip = 1;
   int64_t pndist = 0, pnsse = 0;
+  int is_cost_valid = 1;
 
   if (ref_best_rd < 0)
-    goto term;
+    is_cost_valid = 0;
 
-  if (is_inter_block(mbmi)) {
+  if (is_inter_block(mbmi) && is_cost_valid) {
     int plane;
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       vp9_subtract_plane(x, bsize, plane);
@@ -912,21 +1121,25 @@
     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_tx_size,
                      cpi->sf.use_fast_coef_costing);
-    if (pnrate == INT_MAX)
-      goto term;
+    if (pnrate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
     *rate += pnrate;
     *distortion += pndist;
     *sse += pnsse;
     *skippable &= pnskip;
   }
-  return;
 
-  term:
-  *rate = INT_MAX;
-  *distortion = INT64_MAX;
-  *sse = INT64_MAX;
-  *skippable = 0;
-  return;
+  if (!is_cost_valid) {
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -941,15 +1154,15 @@
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
     xd->mi[0]->mbmi.uv_mode = mode;
 
-    super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                     &this_distortion, &s, &this_sse, bsize, best_rd);
-    if (this_rate_tokenonly == INT_MAX)
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
                 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
@@ -979,19 +1192,19 @@
   int64_t unused;
 
   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
   *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+                                 PICK_MODE_CONTEXT *ctx,
                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  PREDICTION_MODE *mode_uv) {
-  MACROBLOCK *const x = &cpi->mb;
-
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
@@ -1013,14 +1226,8 @@
   return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
 }
 
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE bsize,
-                                int_mv *frame_mv,
-                                int mi_row, int mi_col,
-                                int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv);
-
-static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
+static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                                int i,
                                 PREDICTION_MODE mode, int_mv this_mv[2],
                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
                                 int_mv seg_mvs[MAX_REF_FRAMES],
@@ -1028,6 +1235,7 @@
                                 int *mvcost[2]) {
   MODE_INFO *const mic = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mic->mbmi;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   int thismvcost = 0;
   int idx, idy;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -1068,10 +1276,9 @@
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
-      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
+  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mbmi->ref_frame[0]]) +
             thismvcost;
 }
 
@@ -1094,19 +1301,29 @@
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
 
-  const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
-                                                             p->src.stride)];
-  uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
-                                                        pd->dst.stride)];
+  const uint8_t *const src =
+      &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                            pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+    const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
                                                pd->pre[ref].stride)];
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
+                                     dst, pd->dst.stride,
+                                     &mi->bmi[i].as_mv[ref].as_mv,
+                                     &xd->block_refs[ref]->sf, width, height,
+                                     ref, kernel, MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i % 2),
+                                     mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
+  } else {
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
@@ -1115,25 +1332,57 @@
                               mi_col * MI_SIZE + 4 * (i % 2),
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
+#else
+    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+                              dst, pd->dst.stride,
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 
-  vp9_subtract_block(height, width,
-                     raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
-                     src, p->src.stride,
-                     dst, pd->dst.stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+  } else {
+    vpx_subtract_block(
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride);
+  }
+#else
+  vpx_subtract_block(height, width,
+                     vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+                     8, src, p->src.stride, dst, pd->dst.stride);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
       int64_t ssz, rd, rd1, rd2;
-      int16_t* coeff;
+      tran_low_t* coeff;
 
       k += (idy * 2 + idx);
       coeff = BLOCK_OFFSET(p->coeff, k);
-      x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+      x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        thisdistortion += vp9_highbd_block_error(coeff,
+                                                 BLOCK_OFFSET(pd->dqcoeff, k),
+                                                 16, &ssz, xd->bd);
+      } else {
+        thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                          16, &ssz);
+      }
+#else
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       thissse += ssz;
       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
                               so->scan, so->neighbors,
@@ -1191,13 +1440,14 @@
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
-  p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                   p->src.stride)];
   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
-  pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
-                                                       pd->pre[0].stride)];
+  pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                           pd->pre[0].stride)];
   if (has_second_ref(mbmi))
-    pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
-                                                         pd->pre[1].stride)];
+    pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                           pd->pre[1].stride)];
 }
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
@@ -1217,11 +1467,9 @@
 // TODO(aconverse): Find out if this is still productive then clean up or remove
 static int check_best_zero_mv(
     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
-    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int inter_mode_mask, int this_mode,
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2]) {
-  if ((inter_mode_mask & (1 << ZEROMV)) &&
-      (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
@@ -1252,8 +1500,191 @@
   return 1;
 }
 
+static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize,
+                                int_mv *frame_mv,
+                                int mi_row, int mi_col,
+                                int_mv single_newmv[MAX_REF_FRAMES],
+                                int *rate_mv) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int refs[2] = {mbmi->ref_frame[0],
+                       mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+  int_mv ref_mv[2];
+  int ite, ref;
+  const InterpKernel *kernel = vp9_filter_kernels[mbmi->interp_filter];
+  struct scale_factors sf;
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  int last_besterr[2] = {INT_MAX, INT_MAX};
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+  };
+
+  // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  uint8_t *second_pred;
+#else
+  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (ref = 0; ref < 2; ++ref) {
+    ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+    if (scaled_ref_frame[ref]) {
+      int i;
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        backup_yv12[ref][i] = xd->plane[i].pre[ref];
+      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                           NULL);
+    }
+
+    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+  }
+
+  // Since we have scaled the reference frames to match the size of the current
+  // frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height,
+                                    cm->use_highbitdepth);
+#else
+  vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    MV tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // Even iterations search in the first reference frame,
+                       // odd iterations search in the second. The predictor
+                       // found for the 'other' reference frame is factored in.
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+      vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
+                                       ref_yv12[!id].stride,
+                                       second_pred, pw,
+                                       &frame_mv[refs[!id]].as_mv,
+                                       &sf, pw, ph, 0,
+                                       kernel, MV_PRECISION_Q3,
+                                       mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                       xd->bd);
+    } else {
+      second_pred = (uint8_t *)second_pred_alloc_16;
+      vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                ref_yv12[!id].stride,
+                                second_pred, pw,
+                                &frame_mv[refs[!id]].as_mv,
+                                &sf, pw, ph, 0,
+                                kernel, MV_PRECISION_Q3,
+                                mi_col * MI_SIZE, mi_row * MI_SIZE);
+    }
+#else
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]].as_mv,
+                              &sf, pw, ph, 0,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    // Do compound motion search on the current reference frame.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
+
+    // Use the mv result from the single mode as mv predictor.
+    tmp_mv = frame_mv[refs[id]].as_mv;
+
+    tmp_mv.col >>= 3;
+    tmp_mv.row >>= 3;
+
+    // Small-range full-pixel motion search.
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[bsize],
+                                       &ref_mv[id].as_mv, second_pred);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+                                      second_pred, &cpi->fn_ptr[bsize], 1);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      bestsme = cpi->find_fractional_mv_step(
+          x, &tmp_mv,
+          &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
+          x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
+          pw, ph);
+    }
+
+    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[0];
+
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_mv = tmp_mv;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  *rate_mv = 0;
+
+  for (ref = 0; ref < 2; ++ref) {
+    if (scaled_ref_frame[ref]) {
+      // Restore the prediction frame pointers to their unscaled versions.
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[ref] = backup_yv12[ref][i];
+    }
+
+    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  }
+}
+
 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                        const TileInfo * const tile,
                                         int_mv *best_ref_mv,
                                         int_mv *second_best_ref_mv,
                                         int64_t best_rd, int *returntotrate,
@@ -1287,6 +1718,7 @@
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
   const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
 
   vp9_zero(*bsi);
 
@@ -1299,8 +1731,8 @@
   for (i = 0; i < 4; i++)
     bsi->modes[i] = ZEROMV;
 
-  vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
-  vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
+  memcpy(t_above, pd->above_context, sizeof(t_above));
+  memcpy(t_left, pd->left_context, sizeof(t_left));
 
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
@@ -1323,9 +1755,10 @@
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
         frame_mv[ZEROMV][frame].as_int = 0;
-        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
                                       &frame_mv[NEARESTMV][frame],
-                                      &frame_mv[NEARMV][frame]);
+                                      &frame_mv[NEARMV][frame],
+                                      mbmi_ext->mode_context);
       }
 
       // search for the best motion vector on this segment
@@ -1338,16 +1771,15 @@
         if (!(inter_mode_mask & (1 << this_mode)))
           continue;
 
-        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                inter_mode_mask,
+        if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
                                 this_mode, mbmi->ref_frame))
           continue;
 
-        vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
-        vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
-                   sizeof(bsi->rdstat[i][mode_idx].ta));
-        vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
-                   sizeof(bsi->rdstat[i][mode_idx].tl));
+        memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+        memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+               sizeof(bsi->rdstat[i][mode_idx].ta));
+        memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+               sizeof(bsi->rdstat[i][mode_idx].tl));
 
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
@@ -1358,13 +1790,14 @@
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
+          int cost_list[5];
 
           /* Is the best so far sufficiently good that we can't justify doing
            * a new motion search. */
           if (best_rd < label_mv_thresh)
             break;
 
-          if (!is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode != BEST) {
             // use previous block's result as next block's MV predictor.
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1390,7 +1823,7 @@
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+          if (cpi->sf.adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
             step_param = MAX(step_param, 8);
@@ -1401,12 +1834,14 @@
 
           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
-          bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                          sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
-                                          INT_MAX, 1);
+          bestsme = vp9_full_pixel_search(
+              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+              &bsi->ref_mv[0]->as_mv, new_mv,
+              INT_MAX, 1);
 
           // Should we do a full search (best quality only)
-          if (is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode == BEST) {
             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
@@ -1415,6 +1850,7 @@
                                            sadpb, 16, &cpi->fn_ptr[bsize],
                                            &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
+            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
             if (thissme < bestsme) {
               bestsme = thissme;
               *new_mv = best_mv->as_mv;
@@ -1427,17 +1863,19 @@
 
           if (bestsme < INT_MAX) {
             int distortion;
-            cpi->find_fractional_mv_step(x,
-                                         new_mv,
-                                         &bsi->ref_mv[0]->as_mv,
-                                         cm->allow_high_precision_mv,
-                                         x->errorperbit, &cpi->fn_ptr[bsize],
-                                         cpi->sf.mv.subpel_force_stop,
-                                         cpi->sf.mv.subpel_iters_per_step,
-                                         x->nmvjointcost, x->mvcost,
-                                         &distortion,
-                                         &x->pred_sse[mbmi->ref_frame[0]],
-                                         NULL, 0, 0);
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, 0, 0);
 
             // save motion search result for use in compound prediction
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1475,7 +1913,7 @@
         }
 
         bsi->rdstat[i][mode_idx].brate =
-            set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
+            set_and_cost_bmi_mvs(cpi, x, xd, i, this_mode, mode_mv[this_mode],
                                  frame_mv, seg_mvs[i], bsi->ref_mv,
                                  x->nmvjointcost, x->mvcost);
 
@@ -1517,8 +1955,8 @@
 
           if (!subpelmv && have_ref &&
               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
-            vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
-                       sizeof(SEG_RDSTAT));
+            memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+                   sizeof(SEG_RDSTAT));
             if (num_4x4_blocks_wide > 1)
               bsi->rdstat[i + 1][mode_idx].eobs =
                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
@@ -1566,14 +2004,14 @@
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
-        return INT64_MAX;;
+        return INT64_MAX;
       }
 
       mode_idx = INTER_OFFSET(mode_selected);
-      vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
-      vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+      memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+      memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
 
-      set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
+      set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
                            frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
                            x->mvcost);
 
@@ -1589,7 +2027,7 @@
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
-        return INT64_MAX;;
+        return INT64_MAX;
       }
     }
   } /* for each label */
@@ -1634,16 +2072,16 @@
                                      int segment_id,
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
-                                     vp9_prob *comp_mode_p) {
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+                                     vpx_prob *comp_mode_p) {
+  int seg_ref_active = segfeature_active(&cm->seg, segment_id,
+                                         SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
-    vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
-    vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+    memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+    memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
     *comp_mode_p = 128;
   } else {
-    vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
-    vp9_prob comp_inter_p = 128;
+    vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+    vpx_prob comp_inter_p = 128;
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
@@ -1655,8 +2093,8 @@
     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
-      vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
-      vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+      vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+      vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -1675,7 +2113,7 @@
       ref_costs_single[ALTREF_FRAME] = 512;
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
-      vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
+      vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -1693,26 +2131,26 @@
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int64_t comp_pred_diff[REFERENCE_MODES],
-                         const int64_t tx_size_diff[TX_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+                         int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
   ctx->skip = x->skip;
+  ctx->skippable = skippable;
   ctx->best_mode_index = mode_index;
   ctx->mic = *xd->mi[0];
+  ctx->mbmi_ext = *x->mbmi_ext;
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 
-  vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
-  vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
-             sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
+  memcpy(ctx->best_filter_diff, best_filter_diff,
+         sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
                                MV_REFERENCE_FRAME ref_frame,
                                BLOCK_SIZE block_size,
                                int mi_row, int mi_col,
@@ -1723,15 +2161,19 @@
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
-  int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
+  int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+  assert(yv12 != NULL);
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
+  vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
+                   NULL, NULL, mbmi_ext->mode_context);
 
   // Candidate refinement carried out at encoder and decoder
   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -1759,19 +2201,20 @@
   int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mbmi->ref_frame[0];
-  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
+  int cost_list[5];
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
 
   MV pred_mv[3];
-  pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
-  pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
   pred_mv[2] = x->pred_mv[ref];
 
   if (scaled_ref_frame) {
@@ -1799,34 +2242,36 @@
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
-      cm->show_frame) {
-    int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
-                                                       b_width_log2(bsize)));
+  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
+    int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
+          MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = MAX(step_param, boffset);
   }
 
   if (cpi->sf.adaptive_motion_search) {
-    int bwl = b_width_log2(bsize);
-    int bhl = b_height_log2(bsize);
-    int i;
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
 
     if (tlevel < 5)
       step_param += 2;
 
-    for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
-      if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-        x->pred_mv[ref].row = 0;
-        x->pred_mv[ref].col = 0;
-        tmp_mv->as_int = INVALID_MV;
+    // pred_mv_sad is not set up for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          tmp_mv->as_int = INVALID_MV;
 
-        if (scaled_ref_frame) {
-          int i;
-          for (i = 0; i < MAX_MB_PLANE; i++)
-            xd->plane[i].pre[0] = backup_yv12[i];
+          if (scaled_ref_frame) {
+            int i;
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              xd->plane[i].pre[0] = backup_yv12[i];
+          }
+          return;
         }
-        return;
       }
     }
   }
@@ -1837,6 +2282,7 @@
   mvp_full.row >>= 3;
 
   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_cost_list(cpi, cost_list),
                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
@@ -1852,13 +2298,14 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search && cm->show_frame)
+  if (cpi->sf.adaptive_motion_search)
     x->pred_mv[ref] = tmp_mv->as_mv;
 
   if (scaled_ref_frame) {
@@ -1868,147 +2315,7 @@
   }
 }
 
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                BLOCK_SIZE bsize,
-                                int_mv *frame_mv,
-                                int mi_row, int mi_col,
-                                int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
-  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
-  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int refs[2] = { mbmi->ref_frame[0],
-                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
-  int_mv ref_mv[2];
-  int ite, ref;
-  // Prediction buffer from second frame.
-  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-  const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
-  // Do joint motion search in compound mode to get more accurate mv.
-  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
-  struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
-  int last_besterr[2] = {INT_MAX, INT_MAX};
-  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
-    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
-    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
-  };
-
-  for (ref = 0; ref < 2; ++ref) {
-    ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
-
-    if (scaled_ref_frame[ref]) {
-      int i;
-      // Swap out the reference frame for a version that's been scaled to
-      // match the resolution of the current frame, allowing the existing
-      // motion search code to be used without additional modifications.
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        backup_yv12[ref][i] = xd->plane[i].pre[ref];
-      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
-                           NULL);
-    }
-
-    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
-  }
-
-  // Allow joint search multiple times iteratively for each ref frame
-  // and break out the search loop if it couldn't find better mv.
-  for (ite = 0; ite < 4; ite++) {
-    struct buf_2d ref_yv12[2];
-    int bestsme = INT_MAX;
-    int sadpb = x->sadperbit16;
-    MV tmp_mv;
-    int search_range = 3;
-
-    int tmp_col_min = x->mv_col_min;
-    int tmp_col_max = x->mv_col_max;
-    int tmp_row_min = x->mv_row_min;
-    int tmp_row_max = x->mv_row_max;
-    int id = ite % 2;
-
-    // Initialized here because of compiler problem in Visual Studio.
-    ref_yv12[0] = xd->plane[0].pre[0];
-    ref_yv12[1] = xd->plane[0].pre[1];
-
-    // Get pred block from second frame.
-    vp9_build_inter_predictor(ref_yv12[!id].buf,
-                              ref_yv12[!id].stride,
-                              second_pred, pw,
-                              &frame_mv[refs[!id]].as_mv,
-                              &xd->block_refs[!id]->sf,
-                              pw, ph, 0,
-                              kernel, MV_PRECISION_Q3,
-                              mi_col * MI_SIZE, mi_row * MI_SIZE);
-
-    // Compound motion search on first ref frame.
-    if (id)
-      xd->plane[0].pre[0] = ref_yv12[id];
-    vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
-
-    // Use mv result from single mode as mvp.
-    tmp_mv = frame_mv[refs[id]].as_mv;
-
-    tmp_mv.col >>= 3;
-    tmp_mv.row >>= 3;
-
-    // Small-range full-pixel motion search
-    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                       search_range,
-                                       &cpi->fn_ptr[bsize],
-                                       &ref_mv[id].as_mv, second_pred);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
-                                      second_pred, &cpi->fn_ptr[bsize], 1);
-
-    x->mv_col_min = tmp_col_min;
-    x->mv_col_max = tmp_col_max;
-    x->mv_row_min = tmp_row_min;
-    x->mv_row_max = tmp_row_max;
-
-    if (bestsme < INT_MAX) {
-      int dis; /* TODO: use dis in distortion calculation later. */
-      unsigned int sse;
-      bestsme = cpi->find_fractional_mv_step(
-          x, &tmp_mv,
-          &ref_mv[id].as_mv,
-          cpi->common.allow_high_precision_mv,
-          x->errorperbit,
-          &cpi->fn_ptr[bsize],
-          0, cpi->sf.mv.subpel_iters_per_step,
-          x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred,
-          pw, ph);
-    }
-
-    if (id)
-      xd->plane[0].pre[0] = scaled_first_yv12;
-
-    if (bestsme < last_besterr[id]) {
-      frame_mv[refs[id]].as_mv = tmp_mv;
-      last_besterr[id] = bestsme;
-    } else {
-      break;
-    }
-  }
-
-  *rate_mv = 0;
-
-  for (ref = 0; ref < 2; ++ref) {
-    if (scaled_ref_frame[ref]) {
-      // restore the predictor
-      int i;
-      for (i = 0; i < MAX_MB_PLANE; i++)
-        xd->plane[i].pre[ref] = backup_yv12[ref][i];
-    }
-
-    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                &mbmi->ref_mvs[refs[ref]][0].as_mv,
-                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-  }
-
-  vpx_free(second_pred);
-}
 
 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
                                    uint8_t *orig_dst[MAX_MB_PLANE],
@@ -2020,103 +2327,46 @@
   }
 }
 
-static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int *rate2,
-                                    int64_t *distortion, int64_t *distortion_uv,
-                                    int *disable_skip) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
-  const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
-  unsigned int var, sse;
-  // Skipping threshold for ac.
-  unsigned int thresh_ac;
-  // Skipping threshold for dc
-  unsigned int thresh_dc;
-
-  var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
-                               xd->plane[0].dst.buf,
-                               xd->plane[0].dst.stride, &sse);
-
-  if (x->encode_breakout > 0) {
-    // Set a maximum for threshold to avoid big PSNR loss in low bitrate
-    // case. Use extreme low threshold for static frames to limit skipping.
-    const unsigned int max_thresh = (cpi->allow_encode_breakout ==
-                                     ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
-    // The encode_breakout input
-    const unsigned int min_thresh =
-        MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
-
-    // Calculate threshold according to dequant value.
-    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-    thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
-
-    // Adjust threshold according to partition size.
-    thresh_ac >>= 8 - (b_width_log2(bsize) +
-        b_height_log2(bsize));
-    thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
-  } else {
-    thresh_ac = 0;
-    thresh_dc = 0;
-  }
-
-  // Y skipping condition checking
-  if (sse < thresh_ac || sse == 0) {
-    // dc skipping checking
-    if ((sse - var) < thresh_dc || sse == var) {
-      unsigned int sse_u, sse_v;
-      unsigned int var_u, var_v;
-
-      var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
-                                      x->plane[1].src.stride,
-                                      xd->plane[1].dst.buf,
-                                      xd->plane[1].dst.stride, &sse_u);
-
-      // U skipping condition checking
-      if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
-          (sse_u - var_u < thresh_dc || sse_u == var_u)) {
-        var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
-                                        x->plane[2].src.stride,
-                                        xd->plane[2].dst.buf,
-                                        xd->plane[2].dst.stride, &sse_v);
-
-        // V skipping condition checking
-        if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
-            (sse_v - var_v < thresh_dc || sse_v == var_v)) {
-          x->skip = 1;
-
-          // The cost of skip bit needs to be added.
-          *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-
-          // Scaling factor for SSE from spatial domain to frequency domain
-          // is 16. Adjust distortion accordingly.
-          *distortion_uv = (sse_u + sse_v) << 4;
-          *distortion = (sse << 4) + *distortion_uv;
-
-          *disable_skip = 1;
-        }
-      }
-    }
-  }
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field, and especially where spatial
+// complexity is low, it can be hard to cover the cost of a new motion vector
+// in a single block, even if that motion vector reduces distortion. However,
+// once established, that vector may be usable through the nearest and near
+// mv modes to reduce distortion in subsequent blocks and also improve visual
+// quality.
+static int discount_newmv_test(const VP9_COMP *cpi,
+                               int this_mode,
+                               int_mv this_mv,
+                               int_mv (*mode_mv)[MAX_REF_FRAMES],
+                               int ref_frame) {
+  return (!cpi->rc.is_src_frame_alt_ref &&
+          (this_mode == NEWMV) &&
+          (this_mv.as_int != 0) &&
+          ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+           (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+          ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+           (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
 }
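
A minimal sketch of how this test feeds into the rate term, mirroring the
*rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1) update in
handle_inter_mode below; the helper name and the factor value of 8 are
illustrative assumptions only:

  /* Sketch only: the divisor value is assumed here; the real divisor is
   * NEW_MV_DISCOUNT_FACTOR, defined elsewhere in this file. */
  static int sketch_newmv_rate(int rate_mv, int discount) {
    const int factor = 8;                       /* assumed discount factor */
    if (!discount)
      return rate_mv;                           /* charge the full mv rate */
    /* Charge a fraction of the mv rate, but never less than one unit. */
    return rate_mv / factor > 1 ? rate_mv / factor : 1;
  }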
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
-                                 int64_t txfm_cache[],
                                  int *rate2, int64_t *distortion,
                                  int *skippable,
-                                 int *rate_y, int64_t *distortion_y,
-                                 int *rate_uv, int64_t *distortion_uv,
+                                 int *rate_y, int *rate_uv,
                                  int *disable_skip,
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES],
+                                 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+                                 int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
-                                 const int64_t ref_best_rd) {
+                                 const int64_t ref_best_rd,
+                                 int64_t *mask_filter,
+                                 int64_t filter_cache[]) {
   VP9_COMMON *cm = &cpi->common;
-  RD_OPT *rd_opt = &cpi->rd;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
   const int this_mode = mbmi->mode;
   int_mv *frame_mv = mode_mv[this_mode];
@@ -2124,24 +2374,40 @@
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
-  int64_t this_rd = 0;
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
+  uint8_t *tmp_buf;
+#else
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   int pred_exists = 0;
   int intpel_mv;
-  int64_t rd, best_rd = INT64_MAX;
+  int64_t rd, tmp_rd, best_rd = INT64_MAX;
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
   INTERP_FILTER best_filter = SWITCHABLE;
-  int skip_txfm[MAX_MB_PLANE] = {0};
-  int64_t bsse[MAX_MB_PLANE] = {0};
+  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+  int64_t bsse[MAX_MB_PLANE << 2] = {0};
 
   int bsl = mi_width_log2_lookup[bsize];
   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
 
+  int skip_txfm_sb = 0;
+  int64_t skip_sse_sb = INT64_MAX;
+  int64_t distortion_y = 0, distortion_uv = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = (uint8_t *)tmp_buf16;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
     if (xd->up_available)
@@ -2157,6 +2423,12 @@
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_mv[refs[1]].as_int == INVALID_MV)
       return INT64_MAX;
+
+    if (cpi->sf.adaptive_mode_search) {
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+    }
   }
 
   if (this_mode == NEWMV) {
@@ -2171,10 +2443,10 @@
                             mi_row, mi_col, single_newmv, &rate_mv);
       } else {
         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                   &mbmi->ref_mvs[refs[0]][0].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                   &mbmi->ref_mvs[refs[1]][0].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
       *rate2 += rate_mv;
@@ -2184,10 +2456,20 @@
                            &tmp_mv, &rate_mv);
       if (tmp_mv.as_int == INVALID_MV)
         return INT64_MAX;
-      *rate2 += rate_mv;
+
       frame_mv[refs[0]].as_int =
           xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
       single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+      // Estimate the rate implications of a new mv but discount this
+      // under certain circumstances where we want to help initiate a weak
+      // motion field and the distortion gain for a single block may not
+      // be enough to overcome the cost of a new mv.
+      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+        *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+      } else {
+        *rate2 += rate_mv;
+      }
     }
   }
 
@@ -2212,11 +2494,25 @@
     orig_dst_stride[i] = xd->plane[i].dst.stride;
   }
 
-  /* We don't include the cost of the second reference here, because there
-   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-   * words if you present them in that order, the second one is always known
-   * if the first is known */
-  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+  // We don't include the cost of the second reference here, because there
+  // are only two options: Last/ARF or Golden/ARF. The second reference is
+  // therefore always known to be ARF.
+  //
+  // Under some circumstances we discount the cost of the new mv mode to
+  // encourage initiation of a motion field.
+  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+                          mode_mv, refs[0])) {
+    *rate2 += MIN(cost_mv_ref(cpi, this_mode,
+                              mbmi_ext->mode_context[refs[0]]),
+                  cost_mv_ref(cpi, NEARESTMV,
+                              mbmi_ext->mode_context[refs[0]]));
+  } else {
+    *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+  }
+
+  if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+      mbmi->mode != NEARESTMV)
+    return INT64_MAX;
 
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
@@ -2226,9 +2522,8 @@
 
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  rd_opt->mask_filter = 0;
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    rd_opt->filter_cache[i] = INT64_MAX;
+    filter_cache[i] = INT64_MAX;
 
   if (cm->interp_filter != BILINEAR) {
     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
@@ -2241,21 +2536,31 @@
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         int j;
         int64_t rs_rd;
+        int tmp_skip_sb = 0;
+        int64_t tmp_skip_sse = INT64_MAX;
+
         mbmi->interp_filter = i;
-        rs = vp9_get_switchable_rate(cpi);
+        rs = vp9_get_switchable_rate(cpi, xd);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
         if (i > 0 && intpel_mv) {
           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
-          rd_opt->filter_cache[i] = rd;
-          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
-              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
           if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
-          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+          *mask_filter = MAX(*mask_filter, rd);
         } else {
           int rate_sum = 0;
           int64_t dist_sum = 0;
+          if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+              (cpi->sf.interp_filter_search_mask & (1 << i))) {
+            rate_sum = INT_MAX;
+            dist_sum = INT64_MAX;
+            continue;
+          }
+
           if ((cm->interp_filter == SWITCHABLE &&
                (!i || best_needs_copy)) ||
               (cm->interp_filter != SWITCHABLE &&
@@ -2269,15 +2574,16 @@
             }
           }
           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
+                          &tmp_skip_sb, &tmp_skip_sse);
 
           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-          rd_opt->filter_cache[i] = rd;
-          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
-              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
           if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
-          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+          *mask_filter = MAX(*mask_filter, rd);
 
           if (i == 0 && intpel_mv) {
             tmp_rate_sum = rate_sum;
@@ -2298,14 +2604,18 @@
           best_filter = mbmi->interp_filter;
           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
             best_needs_copy = !best_needs_copy;
-          vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
-          vpx_memcpy(bsse, x->bsse, sizeof(bsse));
         }
 
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
             (cm->interp_filter != SWITCHABLE &&
              cm->interp_filter == mbmi->interp_filter)) {
           pred_exists = 1;
+          tmp_rd = best_rd;
+
+          skip_txfm_sb = tmp_skip_sb;
+          skip_sse_sb = tmp_skip_sse;
+          memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+          memcpy(bsse, x->bsse, sizeof(bsse));
         }
       }
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2314,7 +2624,7 @@
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : best_filter;
-  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -2324,17 +2634,31 @@
         xd->plane[i].dst.stride = 64;
       }
     }
+    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
   } else {
-    // Handles the special case when a filter that is not in the
-    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-  }
-
-  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     int tmp_rate;
     int64_t tmp_dist;
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+    // Handles the special case when a filter that is not in the
+    // switchable list (e.g. bilinear) is indicated at the frame level, or
+    // the skip condition holds.
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                    &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+    memcpy(bsse, x->bsse, sizeof(bsse));
+  }
+
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+  if (cpi->sf.adaptive_mode_search)
+    if (is_comp_pred)
+      if (single_skippable[this_mode][refs[0]] &&
+          single_skippable[this_mode][refs[1]])
+        memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm));
+
+  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
     if (rd / 2 > ref_best_rd) {
@@ -2344,25 +2668,20 @@
   }
 
   if (cm->interp_filter == SWITCHABLE)
-    *rate2 += vp9_get_switchable_rate(cpi);
+    *rate2 += rs;
 
-  if (!is_comp_pred) {
-    if (cpi->allow_encode_breakout)
-      rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv,
-                              disable_skip);
-  }
+  memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+  memcpy(x->bsse, bsse, sizeof(bsse));
 
-  vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
-  vpx_memcpy(x->bsse, bsse, sizeof(bsse));
-
-  if (!x->skip) {
+  if (!skip_txfm_sb) {
     int skippable_y, skippable_uv;
     int64_t sseuv = INT64_MAX;
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
-    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
-                          bsize, txfm_cache, ref_best_rd);
+    vp9_subtract_plane(x, bsize, 0);
+    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                    bsize, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -2372,14 +2691,13 @@
     }
 
     *rate2 += *rate_y;
-    *distortion += *distortion_y;
+    *distortion += distortion_y;
 
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
-                     bsize, ref_best_rd - rdcosty);
-    if (*rate_uv == INT_MAX) {
+    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                          &sseuv, bsize, ref_best_rd - rdcosty)) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2388,177 +2706,296 @@
 
     *psse += sseuv;
     *rate2 += *rate_uv;
-    *distortion += *distortion_uv;
+    *distortion += distortion_uv;
     *skippable = skippable_y && skippable_uv;
+  } else {
+    x->skip = 1;
+    *disable_skip = 1;
+
+    // The cost of the skip bit needs to be added.
+    *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+    *distortion = skip_sse_sb;
   }
 
+  if (!is_comp_pred)
+    single_skippable[this_mode][refs[0]] = *skippable;
+
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
-  return this_rd;  // if 0, this will be re-calculated by caller
+  return 0;  // The rate-distortion cost will be re-calculated by caller.
 }
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate, int64_t *returndist,
-                               BLOCK_SIZE bsize,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = xd->plane;
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   int y_skip = 0, uv_skip = 0;
-  int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
+  int64_t dist_y = 0, dist_uv = 0;
   TX_SIZE max_uv_tx_size;
   x->skip_encode = 0;
   ctx->skip = 0;
   xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->mbmi.ref_frame[1] = NONE;
 
   if (bsize >= BLOCK_8X8) {
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                               &dist_y, &y_skip, bsize, tx_cache,
+                               &dist_y, &y_skip, bsize,
                                best_rd) >= best_rd) {
-      *returnrate = INT_MAX;
+      rd_cost->rate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
-                                         pd[1].subsampling_x,
-                                         pd[1].subsampling_y);
-    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, bsize, max_uv_tx_size);
   } else {
     y_skip = 0;
     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                      &dist_y, best_rd) >= best_rd) {
-      *returnrate = INT_MAX;
+      rd_cost->rate = INT_MAX;
       return;
     }
-    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
-                                         pd[1].subsampling_x,
-                                         pd[1].subsampling_y);
-    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
   }
+  max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
+                                       pd[1].subsampling_x,
+                                       pd[1].subsampling_y);
+  rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+                          &dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
+                          max_uv_tx_size);
 
   if (y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                  vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-    *returndist = dist_y + dist_uv;
-    vp9_zero(ctx->tx_rd_diff);
+    rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+                    vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+    rd_cost->dist = dist_y + dist_uv;
   } else {
-    int i;
-    *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
-    *returndist = dist_y + dist_uv;
-    if (cpi->sf.tx_size_search_method == USE_FULL_RD)
-      for (i = 0; i < TX_MODES; i++) {
-        if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
-          ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
-        else
-          ctx->tx_rd_diff[i] = 0;
-      }
+    rd_cost->rate = rate_y + rate_uv +
+                      vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+    rd_cost->dist = dist_y + dist_uv;
   }
 
   ctx->mic = *xd->mi[0];
+  ctx->mbmi_ext = *x->mbmi_ext;
+  rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
 }
 
-// Updating rd_thresh_freq_fact[] here means that the different
-// partition/block sizes are handled independently based on the best
-// choice for the current partition. It may well be better to keep a scaled
-// best rd so far value and update rd_thresh_freq_fact based on the mode/size
-// combination that wins out.
-static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
-                                  int best_mode_index) {
-  if (cpi->sf.adaptive_rd_thresh > 0) {
-    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
-    int mode;
-    for (mode = 0; mode < top_mode; ++mode) {
-      int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
+// This function is designed to apply a bias or adjustment to an rd value based
+// on the relative variance of the source and reconstruction.
+#define LOW_VAR_THRESH 16
+#define VLOW_ADJ_MAX 25
+#define VHIGH_ADJ_MAX 8
+static void rd_variance_adjustment(VP9_COMP *cpi,
+                                   MACROBLOCK *x,
+                                   BLOCK_SIZE bsize,
+                                   int64_t *this_rd,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   unsigned int source_variance) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int recon_variance;
+  unsigned int absvar_diff = 0;
+  int64_t var_error = 0;
+  int64_t var_factor = 0;
 
-      if (mode == best_mode_index) {
-        *fact -= (*fact >> 3);
-      } else {
-        *fact = MIN(*fact + RD_THRESH_INC,
-                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
-      }
-    }
+  if (*this_rd == INT64_MAX)
+    return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    recon_variance =
+      vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
+  } else {
+    recon_variance =
+      vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
   }
+#else
+  recon_variance =
+    vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
+    absvar_diff = (source_variance > recon_variance)
+      ? (source_variance - recon_variance)
+      : (recon_variance - source_variance);
+
+    var_error = (200 * source_variance * recon_variance) /
+      ((source_variance * source_variance) +
+       (recon_variance * recon_variance));
+    var_error = 100 - var_error;
+  }
+
+  // The first case of interest is where the source variance is above a
+  // threshold and the ref frame is intra.
+  // This case is targeted mainly at discouraging intra modes that give rise
+  // to a predictor with a low spatial complexity compared to the source.
+  if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
+      (source_variance > recon_variance)) {
+    var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
+  // A second possible case of interest is where the source variance
+  // is very low and we wish to discourage false texture or motion trails.
+  } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
+             (recon_variance > source_variance)) {
+    var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
+  }
+  *this_rd += (*this_rd * var_factor) / 100;
 }
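
A worked pass through the adjustment, using the constants above
(LOW_VAR_THRESH 16, VLOW_ADJ_MAX 25) and round illustrative numbers:

  /* Intra candidate whose predictor is much flatter than the source. */
  const unsigned int source_variance = 100;
  const unsigned int recon_variance = 25;
  /* absvar_diff = 100 - 25 = 75
   * var_error   = 100 - (200 * 100 * 25) / (100 * 100 + 25 * 25)
   *             = 100 - 500000 / 10625 = 100 - 47 = 53
   * var_factor  = MIN(75, MIN(VLOW_ADJ_MAX, 53)) = 25
   * so *this_rd is inflated by 25%, penalising the overly flat predictor. */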
 
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  const TileInfo *const tile,
-                                  int mi_row, int mi_col,
-                                  int *returnrate,
-                                  int64_t *returndistortion,
-                                  BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far) {
+
+// Do we have an internal image edge (e.g. formatting bars)?
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+  return (cpi->oxcf.pass == 2) &&
+    ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+    (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two-pass encoding, account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two-pass encoding, account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB column.
+    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge = MAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi,
+                       int mi_row, int mi_col) {
+  return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
+         vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+}
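
As a small illustration, assume the two-pass stats report
inactive_zone_rows = 2, i.e. two letterboxed MB rows at the top and bottom of
the frame (the numbers below are illustrative only):

  int top_edge = 2 * 2;          /* 2 MB rows -> 4 mi rows of formatting bar */
  int mi_row = 0, mi_step = 8;   /* MI_BLOCK_SIZE: a 64x64 SB spans mi rows 0..7 */
  /* top_edge >= mi_row && top_edge < mi_row + mi_step  ->  1, so this
   * superblock straddles the boundary and is treated as an active h edge. */
  int on_active_h_edge = (top_edge >= mi_row) && (top_edge < mi_row + mi_step);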
+
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
+                               TileDataEnc *tile_data,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
   RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
-  struct macroblockd_plane *const pd = xd->plane;
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
-  int comp_pred, i;
+  int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+  INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
-  int64_t best_tx_rd[TX_MODES];
-  int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
-  int mode_index, best_mode_index = -1;
+  int best_mode_skippable = 0;
+  int midx, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
-  int64_t best_inter_rd = INT64_MAX;
+  unsigned int best_pred_sse = UINT_MAX;
   PREDICTION_MODE best_intra_mode = DC_PRED;
-  MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
-  const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
-  const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int best_skip2 = 0;
-  int mode_skip_mask = 0;
-  int mode_skip_start = cpi->sf.mode_skip_start + 1;
+  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+  int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
-  const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
-  const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
-  const int intra_y_mode_mask =
-      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  int64_t mode_threshold[MAX_MODES];
+  int *mode_map = tile_data->mode_map[bsize];
+  const int mode_search_skip_flags = sf->mode_search_skip_flags;
+  int64_t mask_filter = 0;
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
   vp9_zero(best_mbmode);
-  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
 
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < TX_MODES; i++)
-    best_tx_rd[i] = INT64_MAX;
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
     x->pred_sse[i] = INT_MAX;
+  for (i = 0; i < MB_MODE_COUNT; ++i) {
+    for (k = 0; k < MAX_REF_FRAMES; ++k) {
+      single_inter_filter[i][k] = SWITCHABLE;
+      single_skippable[i][k] = 0;
+    }
+  }
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col,
+      assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -2566,74 +3003,98 @@
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    // All modes from vp9_mode_order that use this frame as any ref
-    static const int ref_frame_mask_all[] = {
-        0x0, 0x123291, 0x25c444, 0x39b722
-    };
-    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
-    // this frame as their primary ref
-    static const int ref_frame_mask_fixedmv[] = {
-        0x0, 0x121281, 0x24c404, 0x080102
-    };
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
-      // Skip modes for missing references
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
-    } else if (cpi->sf.reference_masking) {
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped iff both reference frames
+      // are masked out.
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    } else if (sf->reference_masking) {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
           break;
         }
       }
     }
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
   }
 
   // Disable this drop out case if the ref frame
   // segment level feature is enabled for this segment. This is to
   // prevent the possibility that we end up unable to pick any mode.
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      mode_skip_mask =
-          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARESTA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
     }
   }
 
-  // TODO(JBB): This is to make up for the fact that we don't have sad
-  // functions that work when the block size reads outside the umv.  We
-  // should fix this either by making the motion search just work on
-  // a representative block in the boundary ( first ) and then implement a
-  // function that does sads when inside the border..
-  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
-    const int new_modes_mask =
-        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
-        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
-    mode_skip_mask |= new_modes_mask;
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (sf->alt_ref_search_fp) {
+      mode_skip_mask[ALTREF_FRAME] = 0;
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+    }
   }
 
-  if (bsize > cpi->sf.max_intra_bsize) {
-    const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) |
-        (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) |
-        (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) |
-        (1 << THR_D117_PRED) | (1 << THR_D45_PRED);
-    mode_skip_mask |= all_intra_modes;
+  if (sf->alt_ref_search_fp)
+    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+  if (sf->adaptive_mode_search) {
+    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+        cpi->rc.frames_since_golden >= 3)
+      if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+        mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
   }
 
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+  if (bsize > sf->max_intra_bsize) {
+    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+  }
+
+  mode_skip_mask[INTRA_FRAME] |=
+      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+  for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+    mode_threshold[i] = 0;
+  for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+    mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
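+  // When mode scheduling is enabled, sort the first mode_skip_start entries
+  // of mode_map (beyond the first five) into ascending order of their rd
+  // thresholds so cheaper modes are evaluated first; each bubble pass stops
+  // at the position of the last swap.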
+  midx =  sf->schedule_mode_search ? mode_skip_start : 0;
+  while (midx > 4) {
+    uint8_t end_pos = 0;
+    for (i = 5; i < midx; ++i) {
+      if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
+        uint8_t tmp = mode_map[i];
+        mode_map[i] = mode_map[i - 1];
+        mode_map[i - 1] = tmp;
+        end_pos = i;
+      }
+    }
+    midx = end_pos;
+  }
+
+  for (midx = 0; midx < MAX_MODES; ++midx) {
+    int mode_index = mode_map[midx];
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
@@ -2641,26 +3102,30 @@
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
-    int64_t tx_cache[TX_MODES];
-    int i;
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
-    if (mode_index == mode_skip_start && best_mode_index >= 0) {
-      switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
+    if (midx == mode_skip_start && best_mode_index >= 0) {
+      switch (best_mbmode.ref_frame[0]) {
         case INTRA_FRAME:
           break;
         case LAST_FRAME:
-          mode_skip_mask |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
         case GOLDEN_FRAME:
-          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
         case ALTREF_FRAME:
-          mode_skip_mask |= ALT_REF_MODE_MASK;
+          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
           break;
         case NONE:
         case MAX_REF_FRAMES:
@@ -2668,26 +3133,27 @@
           break;
       }
     }
-    if (mode_skip_mask & (1 << mode_index))
+
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
+      continue;
+
+    if (mode_skip_mask[ref_frame] & (1 << this_mode))
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if (rd_less_than_thresh(best_rd, rd_threshes[mode_index],
-                            rd_thresh_freq_fact[mode_index]))
+    if (best_mode_skippable && sf->schedule_mode_search)
+      mode_threshold[mode_index] <<= 1;
+
+    if (best_rd < mode_threshold[mode_index])
       continue;
 
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
-      continue;
-    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
-
-    if (cpi->sf.motion_field_mode_search) {
+    if (sf->motion_field_mode_search) {
       const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
-                                tile->mi_col_end - mi_col);
+                                tile_info->mi_col_end - mi_col);
       const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
-                                tile->mi_row_end - mi_row);
-      const int bsl = mi_width_log2(bsize);
+                                tile_info->mi_row_end - mi_row);
+      const int bsl = mi_width_log2_lookup[bsize];
       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
           + get_chessboard_index(cm->current_video_frame)) & 0x1;
       MB_MODE_INFO *ref_mbmi;
@@ -2697,7 +3163,7 @@
       int_mv ref_mv;
       ref_mv.as_int = INVALID_MV;
 
-      if ((mi_row - 1) >= tile->mi_row_start) {
+      if ((mi_row - 1) >= tile_info->mi_row_start) {
         ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
         rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
         for (i = 0; i < mi_width; ++i) {
@@ -2708,7 +3174,7 @@
         }
       }
 
-      if ((mi_col - 1) >= tile->mi_col_start) {
+      if ((mi_col - 1) >= tile_info->mi_col_start) {
         if (ref_mv.as_int == INVALID_MV)
           ref_mv = xd->mi[-1]->mbmi.mv[0];
         if (rf == NONE)
@@ -2733,14 +3199,22 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
+
+      // Skip compound inter modes if ARF is not available.
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
-          best_mode_index >=0 &&
-          vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
+          best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
         continue;
-      if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
-          ref_frame != best_inter_ref_frame &&
-          second_ref_frame != best_inter_ref_frame)
-        continue;
+
       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
     } else {
       if (ref_frame != INTRA_FRAME)
@@ -2748,8 +3222,10 @@
     }
 
     if (ref_frame == INTRA_FRAME) {
-      if (!(intra_y_mode_mask & (1 << this_mode)))
-        continue;
+      if (sf->adaptive_mode_search)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+          continue;
+
       if (this_mode != DC_PRED) {
         // Disable intra modes other than DC_PRED for blocks with low variance
         // Threshold for intra skipping based on source variance
@@ -2763,7 +3239,7 @@
         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
           if (best_mode_index >= 0 &&
-              vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+              best_mbmode.ref_frame[0] > INTRA_FRAME)
             continue;
         }
         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -2773,8 +3249,8 @@
       }
     } else {
       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
-      if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                              inter_mode_mask, this_mode, ref_frames))
+      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+                              this_mode, ref_frames))
         continue;
     }
 
@@ -2786,6 +3262,8 @@
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                           : cm->interp_filter;
+    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -2796,21 +3274,19 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-    for (i = 0; i < TX_MODES; ++i)
-      tx_cache[i] = INT64_MAX;
-
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
-                            bsize, tx_cache, best_rd);
-
+      struct macroblockd_plane *const pd = &xd->plane[1];
+      memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      NULL, bsize, best_rd);
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x,
-                                  pd[1].subsampling_y);
+      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
+                                  pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+        choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
       }
@@ -2826,13 +3302,13 @@
       distortion2 = distortion_y + distortion_uv;
     } else {
       this_rd = handle_inter_mode(cpi, x, bsize,
-                                  tx_cache,
                                   &rate2, &distortion2, &skippable,
-                                  &rate_y, &distortion_y,
-                                  &rate_uv, &distortion_uv,
+                                  &rate_y, &rate_uv,
                                   &disable_skip, frame_mv,
                                   mi_row, mi_col,
-                                  single_newmv, &total_sse, best_rd);
+                                  single_newmv, single_inter_filter,
+                                  single_skippable, &total_sse, best_rd,
+                                  &mask_filter, filter_cache);
       if (this_rd == INT64_MAX)
         continue;
 
@@ -2852,18 +3328,11 @@
 
     if (!disable_skip) {
       if (skippable) {
-        vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
-
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
-        // for best yrd calculation
-        rate_uv = 0;
 
         // Cost the skip mb case
-        if (skip_prob) {
-          int prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-          rate2 += prob_skip_cost;
-        }
+        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
       } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
@@ -2875,8 +3344,6 @@
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
-          rate_y = 0;
-          rate_uv = 0;
           this_skip2 = 1;
         }
       } else {
@@ -2888,18 +3355,17 @@
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
     }
 
+    // Apply an adjustment to the rd value based on the similarity of the
+    // source variance and reconstructed variance.
+    rd_variance_adjustment(cpi, x, bsize, &this_rd,
+                           ref_frame, x->source_variance);
+
     if (ref_frame == INTRA_FRAME) {
     // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
         best_intra_rd = this_rd;
         best_intra_mode = mbmi->mode;
       }
-    } else {
-      // Keep record of best inter rd with single reference
-      if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
-        best_inter_rd = this_rd;
-        best_inter_ref_frame = ref_frame;
-      }
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -2909,12 +3375,6 @@
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-
     // Did this mode help, i.e. is it the new best mode?
     if (this_rd < best_rd || x->skip) {
       int max_plane = MAX_MB_PLANE;
@@ -2926,25 +3386,35 @@
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
           max_plane = 1;
+        } else {
+          best_pred_sse = x->pred_sse[ref_frame];
         }
 
-        *returnrate = rate2;
-        *returndistortion = distortion2;
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
+        best_mode_skippable = skippable;
+
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
-        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
-                   sizeof(uint8_t) * ctx->num_4x4_blk);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (mode_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
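
A quick illustration of the qstep normalization introduced in the hunk above:
qstep is no longer const because, for high-bit-depth frames, the dequant step
read from xd->plane[0].dequant[1] is expressed on the internal bit-depth scale
and is shifted back to an 8-bit scale before it feeds the early-termination
test. The values below are purely illustrative (the exact dequant scaling per
bit depth is an assumption, not something shown in this hunk):

    int qstep = 64;        /* example dequant step as stored for a 10-bit frame */
    const int bd = 10;
    qstep >>= (bd - 8);    /* 64 >> 2 == 16, the equivalent step on an 8-bit scale */
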
@@ -2973,34 +3443,32 @@
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
       if (!comp_pred) {
-        if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+        if (single_rd < best_pred_rd[SINGLE_REFERENCE])
           best_pred_rd[SINGLE_REFERENCE] = single_rd;
-        }
       } else {
-        if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+        if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
-        }
       }
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
 
       /* keep record of best filter type */
       if (!mode_excluded && cm->interp_filter != BILINEAR) {
-        int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->interp_filter];
 
         for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
           int64_t adj_rd;
           if (ref == INT64_MAX)
             adj_rd = 0;
-          else if (rd_opt->filter_cache[i] == INT64_MAX)
+          else if (filter_cache[i] == INT64_MAX)
             // When early termination is triggered, the encoder does not have
             // access to the rate-distortion cost. It only knows that the cost
             // should be above the maximum valid value. Hence it takes the known
             // maximum plus an arbitrary constant as the rate-distortion cost.
-            adj_rd = rd_opt->mask_filter - ref + 10;
+            adj_rd = mask_filter - ref + 10;
           else
-            adj_rd = rd_opt->filter_cache[i] - ref;
+            adj_rd = filter_cache[i] - ref;
 
           adj_rd += this_rd;
           best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -3008,23 +3476,6 @@
       }
     }
 
-    /* keep record of best txfm size */
-    if (bsize < BLOCK_32X32) {
-      if (bsize < BLOCK_16X16)
-        tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
-
-      tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
-    }
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
-        int64_t adj_rd = INT64_MAX;
-        adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
-
-        if (adj_rd < best_tx_rd[i])
-          best_tx_rd[i] = adj_rd;
-      }
-    }
-
     if (early_term)
       break;
 
@@ -3032,13 +3483,38 @@
       break;
   }
 
-  if (best_mode_index < 0 || best_rd >= best_rd_so_far)
-    return INT64_MAX;
+  // The inter modes' rate costs are not calculated precisely in some cases.
+  // Therefore, NEWMV is sometimes chosen instead of NEARESTMV, NEARMV, or
+  // ZEROMV. The checks below detect those cases and correct the mode
+  // decision.
+  if (best_mbmode.mode == NEWMV) {
+    const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+        best_mbmode.ref_frame[1]};
+    int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+    if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARESTMV;
+    else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARMV;
+    else if (best_mbmode.mv[0].as_int == 0 &&
+        ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+      best_mbmode.mode = ZEROMV;
+  }
+
+  if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   // If we used an estimate for the uv intra rd in the loop above...
-  if (cpi->sf.use_uv_intra_rd_estimate) {
+  if (sf->use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
-    if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       TX_SIZE uv_tx_size;
       *mbmi = best_mbmode;
       uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
@@ -3055,7 +3531,9 @@
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
 
-  update_rd_thresh_fact(cpi, bsize, best_mode_index);
+  if (!cpi->rc.is_src_frame_alt_ref)
+    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -3077,42 +3555,55 @@
     }
     if (cm->interp_filter == SWITCHABLE)
       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-    for (i = 0; i < TX_MODES; i++) {
-      if (best_tx_rd[i] == INT64_MAX)
-        best_tx_diff[i] = 0;
-      else
-        best_tx_diff[i] = best_rd - best_tx_rd[i];
-    }
   } else {
     vp9_zero(best_filter_diff);
-    vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  store_coding_context(x, ctx, best_mode_index,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
+  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
+  // updating code causes PSNR loss. Need to figure out the conflict.
+  x->skip |= best_mode_skippable;
 
-  return best_rd;
+  if (!x->skip && !x->select_tx_size) {
+    int has_high_freq_coeff = 0;
+    int plane;
+    int max_plane = is_inter_block(&xd->mi[0]->mbmi)
+                        ? MAX_MB_PLANE : 1;
+    for (plane = 0; plane < max_plane; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    best_mode_skippable |= !has_high_freq_coeff;
+  }
+
+  assert(best_mode_index >= 0);
+
+  store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+                       best_filter_diff, best_mode_skippable);
 }
 
-int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int *returnrate,
-                                           int64_t *returndistortion,
-                                           BLOCK_SIZE bsize,
-                                           PICK_MODE_CONTEXT *ctx,
-                                           int64_t best_rd_so_far) {
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
+                                        TileDataEnc *tile_data,
+                                        MACROBLOCK *x,
+                                        RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
-  RD_OPT *const rd_opt = &cpi->rd;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   unsigned char segment_id = mbmi->segment_id;
   const int comp_pred = 0;
   int i;
-  int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -3128,9 +3619,9 @@
   for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
     x->pred_mv_sad[i] = INT_MAX;
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
-  assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+  assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
   mbmi->mode = ZEROMV;
   mbmi->uv_mode = DC_PRED;
@@ -3139,12 +3630,6 @@
   mbmi->mv[0].as_int = 0;
   x->skip = 1;
 
-  // Search for best switchable filter by checking the variance of
-  // pred error irrespective of whether the filter will be used
-  rd_opt->mask_filter = 0;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    rd_opt->filter_cache[i] = INT64_MAX;
-
   if (cm->interp_filter != BILINEAR) {
     best_filter = EIGHTTAP;
     if (cm->interp_filter == SWITCHABLE &&
@@ -3153,7 +3638,7 @@
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         mbmi->interp_filter = i;
-        rs = vp9_get_switchable_rate(cpi);
+        rs = vp9_get_switchable_rate(cpi, xd);
         if (rs < best_rs) {
           best_rs = rs;
           best_filter = mbmi->interp_filter;
@@ -3164,7 +3649,7 @@
   // Set the appropriate filter
   if (cm->interp_filter == SWITCHABLE) {
     mbmi->interp_filter = best_filter;
-    rate2 += vp9_get_switchable_rate(cpi);
+    rate2 += vp9_get_switchable_rate(cpi, xd);
   } else {
     mbmi->interp_filter = cm->interp_filter;
   }
@@ -3177,39 +3662,42 @@
   rate2 += ref_costs_single[LAST_FRAME];
   this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
-  *returnrate = rate2;
-  *returndistortion = distortion2;
+  rd_cost->rate = rate2;
+  rd_cost->dist = distortion2;
+  rd_cost->rdcost = this_rd;
 
-  if (this_rd >= best_rd_so_far)
-    return INT64_MAX;
+  if (this_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
 
-  update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp9_zero(best_pred_diff);
   vp9_zero(best_filter_diff);
-  vp9_zero(best_tx_diff);
 
   if (!x->select_tx_size)
     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
-
-  return this_rd;
+                       best_pred_diff, best_filter_diff, 0);
 }
 
-int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
-                                      const TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int *returnrate,
-                                      int64_t *returndistortion,
-                                      BLOCK_SIZE bsize,
-                                      PICK_MODE_CONTEXT *ctx,
-                                      int64_t best_rd_so_far) {
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
+                                   TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   int mi_row, int mi_col,
+                                   RD_COST *rd_cost,
+                                   BLOCK_SIZE bsize,
+                                   PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const struct segmentation *const seg = &cm->seg;
@@ -3222,7 +3710,6 @@
                                     VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
-  static const int64_t best_tx_diff[TX_MODES] = { 0 };
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
@@ -3230,24 +3717,30 @@
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
-  int64_t best_inter_rd = INT64_MAX;
-  MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
+  vpx_prob comp_mode_p;
   INTERP_FILTER tmp_best_filter = SWITCHABLE;
   int rate_uv_intra, rate_uv_tokenonly;
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
-  int mode_skip_mask = 0;
+  int ref_frame_skip_mask[2] = { 0 };
+  int64_t mask_filter = 0;
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
 
-  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
-  vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  memset(x->zcoeff_blk[TX_4X4], 0, 4);
   vp9_zero(best_mbmode);
 
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
   for (i = 0; i < 4; i++) {
     int j;
     for (j = 0; j < MAX_REF_FRAMES; j++)
@@ -3263,14 +3756,16 @@
     best_filter_rd[i] = INT64_MAX;
   rate_uv_intra = INT_MAX;
 
-  *returnrate = INT_MAX;
+  rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile,
-                             ref_frame, bsize, mi_row, mi_col,
-                             frame_mv[NEARESTMV], frame_mv[NEARMV],
-                             yv12_mb);
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb);
+    } else {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -3294,20 +3789,21 @@
 
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
-    if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+    if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
       if (ref_index == 3) {
-        switch (vp9_ref_order[best_ref_index].ref_frame[0]) {
+        switch (best_mbmode.ref_frame[0]) {
           case INTRA_FRAME:
-            mode_skip_mask = 0;
             break;
           case LAST_FRAME:
-            mode_skip_mask = 0x0010;
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
           case GOLDEN_FRAME:
-            mode_skip_mask = 0x0008;
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
           case ALTREF_FRAME:
-            mode_skip_mask = 0x0000;
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
             break;
           case NONE:
           case MAX_REF_FRAMES:
@@ -3315,35 +3811,32 @@
             break;
         }
       }
-      if (mode_skip_mask & (1 << ref_index))
-        continue;
     }
 
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
+      continue;
+
     // Test best rd so far against threshold for trying this mode.
-    if (rd_less_than_thresh(best_rd,
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
-                            rd_opt->thresh_freq_fact[bsize][ref_index]))
+                            tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;
 
-    if (ref_frame > INTRA_FRAME &&
-        !(cpi->ref_frame_flags & flag_list[ref_frame])) {
-      continue;
-    }
-
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
         continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use, as in this case there can only be one reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
-          vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME)
-        continue;
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
-          ref_frame != best_inter_ref_frame &&
-          second_ref_frame != best_inter_ref_frame)
+
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mbmode.ref_frame[0] == INTRA_FRAME)
         continue;
     }
 
@@ -3364,15 +3857,13 @@
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative. We allow near/nearest as well
@@ -3409,7 +3900,7 @@
       distortion2 += distortion_y;
 
       if (rate_uv_intra == INT_MAX) {
-        choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+        choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
                              &rate_uv_intra,
                              &rate_uv_tokenonly,
                              &dist_uv, &skip_uv,
@@ -3430,7 +3921,7 @@
       int tmp_best_skippable = 0;
       int switchable_filter_index;
       int_mv *second_ref = comp_pred ?
-                             &mbmi->ref_mvs[second_ref_frame][0] : NULL;
+                             &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
       b_mode_info tmp_best_bmodes[16];
       MB_MODE_INFO tmp_best_mbmode;
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
@@ -3442,18 +3933,17 @@
           rd_opt->threshes[segment_id][bsize][THR_ALTR];
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
       rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      rd_opt->mask_filter = 0;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-        rd_opt->filter_cache[i] = INT64_MAX;
+        filter_cache[i] = INT64_MAX;
 
       if (cm->interp_filter != BILINEAR) {
         tmp_best_filter = EIGHTTAP;
-        if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+        if (x->source_variance < sf->disable_filter_search_var_thresh) {
           tmp_best_filter = EIGHTTAP;
-        } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
+        } else if (sf->adaptive_pred_interp_filter == 1 &&
                    ctx->pred_interp_filter < SWITCHABLE) {
           tmp_best_filter = ctx->pred_interp_filter;
-        } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
+        } else if (sf->adaptive_pred_interp_filter == 2) {
           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
                               ctx->pred_interp_filter : 0;
         } else {
@@ -3462,9 +3952,10 @@
                ++switchable_filter_index) {
             int newbest, rs;
             int64_t rs_rd;
+            MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
             mbmi->interp_filter = switchable_filter_index;
-            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
-                                              &mbmi->ref_mvs[ref_frame][0],
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                              &mbmi_ext->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
                                               &rate_y, &distortion,
                                               &skippable, &total_sse,
@@ -3474,16 +3965,16 @@
 
             if (tmp_rd == INT64_MAX)
               continue;
-            rs = vp9_get_switchable_rate(cpi);
+            rs = vp9_get_switchable_rate(cpi, xd);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
-            rd_opt->filter_cache[SWITCHABLE_FILTERS] =
-                MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
+            filter_cache[switchable_filter_index] = tmp_rd;
+            filter_cache[SWITCHABLE_FILTERS] =
+                MIN(filter_cache[SWITCHABLE_FILTERS],
                     tmp_rd + rs_rd);
             if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
-            rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
+            mask_filter = MAX(mask_filter, tmp_rd);
 
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
@@ -3506,7 +3997,7 @@
               }
               pred_exists = 1;
               if (switchable_filter_index == 0 &&
-                  cpi->sf.use_rd_breakout &&
+                  sf->use_rd_breakout &&
                   best_rd < INT64_MAX) {
                 if (tmp_best_rdu / 2 > best_rd) {
                   // skip searching the other filters if the first is
@@ -3529,8 +4020,8 @@
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
-                                          &mbmi->ref_mvs[ref_frame][0],
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                          &x->mbmi_ext->ref_mvs[ref_frame][0],
                                           second_ref, best_yrd, &rate, &rate_y,
                                           &distortion, &skippable, &total_sse,
                                           (int) this_rd_thresh, seg_mvs, bsi, 0,
@@ -3552,7 +4043,7 @@
       distortion2 += distortion;
 
       if (cm->interp_filter == SWITCHABLE)
-        rate2 += vp9_get_switchable_rate(cpi);
+        rate2 += vp9_get_switchable_rate(cpi, xd);
 
       if (!mode_excluded)
         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -3569,10 +4060,11 @@
         // then don't bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                         &uv_sse, BLOCK_8X8, tmp_best_rdu);
-        if (rate_uv == INT_MAX)
+        memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+        if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
           continue;
+
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -3619,15 +4111,6 @@
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
     }
 
-    // Keep record of best inter rd with single reference
-    if (is_inter_block(mbmi) &&
-        !has_second_ref(mbmi) &&
-        !mode_excluded &&
-        this_rd < best_inter_rd) {
-      best_inter_rd = this_rd;
-      best_inter_ref_frame = ref_frame;
-    }
-
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
@@ -3648,8 +4131,9 @@
           max_plane = 1;
         }
 
-        *returnrate = rate2;
-        *returndistortion = distortion2;
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
         best_rd = this_rd;
         best_yrd = best_rd -
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
@@ -3657,19 +4141,24 @@
         best_skip2 = this_skip2;
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
-        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
-                   sizeof(uint8_t) * ctx->num_4x4_blk);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
         for (i = 0; i < 4; i++)
           best_bmodes[i] = xd->mi[0]->bmi[i];
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
-        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+        if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (ref_index > MIN_EARLY_TERM_INDEX)) {
-          const int qstep = xd->plane[0].dequant[1];
+          int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           if (x->source_variance < UINT_MAX) {
             const int var_adjust = (x->source_variance < 16);
             scale -= var_adjust;
@@ -3697,11 +4186,11 @@
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+      if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
         best_pred_rd[SINGLE_REFERENCE] = single_rd;
-      } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+      else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
-      }
+
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
@@ -3709,20 +4198,20 @@
     /* keep record of best filter type */
     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
         cm->interp_filter != BILINEAR) {
-      int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->interp_filter];
       int64_t adj_rd;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
         if (ref == INT64_MAX)
           adj_rd = 0;
-        else if (rd_opt->filter_cache[i] == INT64_MAX)
+        else if (filter_cache[i] == INT64_MAX)
           // When early termination is triggered, the encoder does not have
           // access to the rate-distortion cost. It only knows that the cost
           // should be above the maximum valid value. Hence it takes the known
           // maximum plus an arbitrary constant as the rate-distortion cost.
-          adj_rd = rd_opt->mask_filter - ref + 10;
+          adj_rd = mask_filter - ref + 10;
         else
-          adj_rd = rd_opt->filter_cache[i] - ref;
+          adj_rd = filter_cache[i] - ref;
 
         adj_rd += this_rd;
         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -3736,13 +4225,16 @@
       break;
   }
 
-  if (best_rd >= best_rd_so_far)
-    return INT64_MAX;
+  if (best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
 
   // If we used an estimate for the uv intra rd in the loop above...
-  if (cpi->sf.use_uv_intra_rd_estimate) {
+  if (sf->use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
-    if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) {
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       *mbmi = best_mbmode;
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
                               &rate_uv_tokenonly,
@@ -3753,16 +4245,18 @@
   }
 
   if (best_rd == INT64_MAX) {
-    *returnrate = INT_MAX;
-    *returndistortion = INT64_MAX;
-    return best_rd;
+    rd_cost->rate = INT_MAX;
+    rd_cost->dist = INT64_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
   }
 
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
 
-  update_rd_thresh_fact(cpi, bsize, best_ref_index);
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            sf->adaptive_rd_thresh, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -3772,7 +4266,7 @@
       xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   } else {
     for (i = 0; i < 4; ++i)
-      vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+      memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
 
     mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
@@ -3798,9 +4292,6 @@
     vp9_zero(best_filter_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_ref_index,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
-
-  return best_rd;
+                       best_pred_diff, best_filter_diff, 0);
 }
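
A short restatement of the NEWMV correction added to vp9_rd_pick_inter_mode_sb
above: because the inter mode rate costs are approximate, the search can land
on NEWMV even when its motion vectors coincide with the NEARESTMV, NEARMV, or
zero-MV candidates, so the winning mode is demoted to the matching candidate.
A hedged sketch of the matching rule it applies (the helper name is
illustrative; int_mv is the encoder's motion-vector union, compared via
as_int):

    static int mv_pair_matches(int_mv best0, int_mv best1,
                               int_mv cand0, int_mv cand1, int is_comp) {
      /* The first MV must match; for compound prediction the second MV must
       * match as well. */
      return best0.as_int == cand0.as_int &&
             (!is_comp || best1.as_int == cand1.as_int);
    }

The checks are applied in the order NEARESTMV, NEARMV, ZEROMV (the last one
against the zero vector), so the most preferred candidate wins.
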
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index 52c603f..00ee55c 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -23,37 +23,49 @@
 struct TileInfo;
 struct VP9_COMP;
 struct macroblock;
+struct RD_COST;
 
 void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
-                               int *r, int64_t *d, BLOCK_SIZE bsize,
+                               struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
-                                  const struct TileInfo *const tile,
-                                  int mi_row, int mi_col,
-                                  int *returnrate,
-                                  int64_t *returndistortion,
-                                  BLOCK_SIZE bsize,
-                                  PICK_MODE_CONTEXT *ctx,
-                                  int64_t best_rd_so_far);
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+#endif
 
-int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
-                                           struct macroblock *x,
-                                           int *returnrate,
-                                           int64_t *returndistortion,
-                                           BLOCK_SIZE bsize,
-                                           PICK_MODE_CONTEXT *ctx,
-                                           int64_t best_rd_so_far);
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
+                               struct TileDataEnc *tile_data,
+                               struct macroblock *x,
+                               int mi_row, int mi_col,
+                               struct RD_COST *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far);
 
-int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
-                                      struct macroblock *x,
-                                      const struct TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int *returnrate,
-                                      int64_t *returndistortion,
-                                      BLOCK_SIZE bsize,
-                                      PICK_MODE_CONTEXT *ctx,
-                                      int64_t best_rd_so_far);
+void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                        struct TileDataEnc *tile_data,
+                                        struct macroblock *x,
+                                        struct RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far);
+
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
+void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+                                   struct TileDataEnc *tile_data,
+                                   struct macroblock *x,
+                                   int mi_row, int mi_col,
+                                   struct RD_COST *rd_cost,
+                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far);
 
 #ifdef __cplusplus
 }  // extern "C"
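
The header change above switches the mode pickers from returning an int64_t RD
value through the return value plus returnrate/returndistortion out-parameters
to filling a single RD_COST struct. A hedged sketch of the resulting calling
pattern (variable names are illustrative; the rate/dist/rdcost fields and the
INT_MAX failure convention follow the uses in vp9_rdopt.c above):

    RD_COST rd_cost;
    vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
                              &rd_cost, bsize, ctx, best_rd_so_far);
    if (rd_cost.rate == INT_MAX) {
      /* No mode beat best_rd_so_far: the old API signalled this by returning
       * INT64_MAX, the new one sets rate to INT_MAX and rdcost to INT64_MAX. */
    }
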
diff --git a/libvpx/vp9/encoder/vp9_resize.c b/libvpx/vp9/encoder/vp9_resize.c
index 4e6efae..59c7478 100644
--- a/libvpx/vp9/encoder/vp9_resize.c
+++ b/libvpx/vp9/encoder/vp9_resize.c
@@ -15,6 +15,10 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_resize.h"
 
@@ -28,7 +32,7 @@
 typedef int16_t interp_kernel[INTERP_TAPS];
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
   {-3,  0, 35, 64, 35,  0, -3, 0},
   {-3, -1, 34, 64, 36,  1, -3, 0},
   {-3, -1, 32, 64, 38,  1, -3, 0},
@@ -64,7 +68,7 @@
 };
 
 // Filters for interpolation (0.625-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
   {-1, -8, 33, 80, 33, -8, -1, 0},
   {-1, -8, 30, 80, 35, -8, -1, 1},
   {-1, -8, 28, 80, 37, -7, -2, 1},
@@ -100,7 +104,7 @@
 };
 
 // Filters for interpolation (0.75-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
   {2, -11,  25,  96,  25, -11,   2, 0},
   {2, -11,  22,  96,  28, -11,   2, 0},
   {2, -10,  19,  95,  31, -11,   2, 0},
@@ -136,7 +140,7 @@
 };
 
 // Filters for interpolation (0.875-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
   {3,  -8,  13, 112,  13,  -8,   3, 0},
   {3,  -7,  10, 112,  17,  -9,   3, -1},
   {2,  -6,   7, 111,  21,  -9,   3, -1},
@@ -172,7 +176,7 @@
 };
 
 // Filters for interpolation (full-band) - no filtering for integer pixels
-const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {0,   1,  -3, 128,   3,  -1,   0, 0},
   {-1,   2,  -6, 127,   7,  -2,   1, 0},
@@ -214,15 +218,15 @@
 static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
   int outlength16 = outlength * 16;
   if (outlength16 >= inlength * 16)
-    return vp9_filteredinterp_filters1000;
+    return filteredinterp_filters1000;
   else if (outlength16 >= inlength * 13)
-    return vp9_filteredinterp_filters875;
+    return filteredinterp_filters875;
   else if (outlength16 >= inlength * 11)
-    return vp9_filteredinterp_filters750;
+    return filteredinterp_filters750;
   else if (outlength16 >= inlength * 9)
-    return vp9_filteredinterp_filters625;
+    return filteredinterp_filters625;
   else
-    return vp9_filteredinterp_filters500;
+    return filteredinterp_filters500;
 }
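
choose_interp_filter above picks the kernel from the output/input length
ratio, stepping from the full-band filter for upscaling down to the 0.5-band
filter for strong downscaling. A worked example with illustrative sizes,
scaling a 1920-sample line down to 1280 samples:

    outlength * 16 = 1280 * 16 = 20480
    inlength  * 13 = 1920 * 13 = 24960   /* 20480 <  24960: not 0.875-band */
    inlength  * 11 = 1920 * 11 = 21120   /* 20480 <  21120: not 0.75-band  */
    inlength  *  9 = 1920 *  9 = 17280   /* 20480 >= 17280: 0.625-band     */

so filteredinterp_filters625 is selected.
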
 
 static void interpolate(const uint8_t *const input, int inlength,
@@ -312,7 +316,7 @@
 static void down2_symeven(const uint8_t *const input, int length,
                           uint8_t *output) {
   // Actual filter len = 2 * filter_len_half.
-  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int16_t *filter = vp9_down2_symeven_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -368,7 +372,7 @@
 static void down2_symodd(const uint8_t *const input, int length,
                          uint8_t *output) {
   // Actual filter len = 2 * filter_len_half - 1.
-  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int16_t *filter = vp9_down2_symodd_half_filter;
   const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
   int i, j;
   uint8_t *optr = output;
@@ -427,7 +431,7 @@
   return length;
 }
 
-int get_down2_steps(int in_length, int out_length) {
+static int get_down2_steps(int in_length, int out_length) {
   int steps = 0;
   int proj_in_length;
   while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
@@ -444,7 +448,7 @@
                              uint8_t *buf) {
   int steps;
   if (length == olength) {
-    memcpy(output, input, sizeof(uint8_t) * length);
+    memcpy(output, input, sizeof(output[0]) * length);
     return;
   }
   steps = get_down2_steps(length, olength);
@@ -516,6 +520,10 @@
   uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
                                       (width < height ? height : width));
   uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
   for (i = 0; i < height; ++i)
     resize_multistep(input + in_stride * i, width,
                         intbuf + width2 * i, width2, tmpbuf);
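
For reference, the multistep path above works as follows: get_down2_steps
counts how many 2:1 halvings keep the length at or above the target,
resize_multistep applies that many down2 passes, and any remaining fractional
ratio is handled by interpolate() with the kernel chosen by
choose_interp_filter. A worked example with illustrative sizes, resizing a
1920-sample row to 720 samples:

    get_down2_steps(1920, 720) == 1      /* 960 >= 720, but 480 < 720        */
    down2:        1920 -> 960
    interpolate:   960 -> 720            /* 720*16 = 11520 >= 960*11 = 10560,
                                            so the 0.75-band kernel is used  */
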
@@ -529,6 +537,302 @@
   free(arrbuf);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+                               uint16_t *output, int outlength, int bd) {
+  const int64_t delta =
+      (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint16_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;
+  if (x1 > x2) {
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] *
+            input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  } else {
+    // Initial part.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] *
+            input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                   0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // Middle part.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // End part.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
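
In highbd_interpolate above, the source coordinate for each output sample is
tracked in 32-bit fixed point: delta is the per-sample step inlength/outlength
and offset center-aligns the two sampling grids. A worked example with
illustrative lengths inlength = 4, outlength = 8:

    delta  = ((4 << 32) + 4) / 8        = 0x80000000    /*  0.50 in Q32 */
    offset = -(((8 - 4) << 31) + 4) / 8 = -0x40000000   /* -0.25 in Q32 */
    /* Output sample x is therefore filtered around source position
     * 0.5 * x - 0.25, i.e. -0.25, 0.25, 0.75, ..., with the initial/middle/
     * end loops clamping taps that fall outside the input. */
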
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half.
+  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half;
+  int l2 = (length - filter_len_half);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                              uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half - 1.
+  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half - 1;
+  int l2 = (length - filter_len_half + 1);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+static void highbd_resize_multistep(const uint16_t *const input,
+                                    int length,
+                                    uint16_t *output,
+                                    int olength,
+                                    uint16_t *buf,
+                                    int bd) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint16_t *out = NULL;
+    uint16_t *tmpbuf = NULL;
+    uint16_t *otmp, *otmp2;
+    int filteredlength = length;
+    if (!tmpbuf) {
+      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
+      otmp = tmpbuf;
+    } else {
+      otmp = buf;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint16_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        highbd_down2_symodd(in, filteredlength, out, bd);
+      else
+        highbd_down2_symeven(in, filteredlength, out, bd);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength) {
+      highbd_interpolate(out, filteredlength, output, olength, bd);
+    }
+    if (tmpbuf)
+      free(tmpbuf);
+  } else {
+    highbd_interpolate(input, length, output, olength, bd);
+  }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *aptr++ = *iptr;
+  }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  int i;
+  uint16_t *iptr = img;
+  uint16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *iptr = *aptr++;
+  }
+}
+
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd) {
+  int i;
+  uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+  uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+                                        (width < height ? height : width));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  for (i = 0; i < height; ++i) {
+    highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+                            intbuf + width2 * i, width2, tmpbuf, bd);
+  }
+  for (i = 0; i < width2; ++i) {
+    highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+                            bd);
+    highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+                           arrbuf + height);
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_resize_frame420(const uint8_t *const y,
                          int y_stride,
                          const uint8_t *const u, const uint8_t *const v,
@@ -574,3 +878,51 @@
   vp9_resize_plane(v, height, width, uv_stride,
                    ov, oheight, owidth, ouv_stride);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride,
+                          ou, oheight / 2, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride,
+                          ov, oheight / 2, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width / 2, uv_stride,
+                          ou, oheight, owidth / 2, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width / 2, uv_stride,
+                          ov, oheight, owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, height, width, uv_stride,
+                          ou, oheight, owidth, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, height, width, uv_stride,
+                          ov, oheight, owidth, ouv_stride, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
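
Note on the high-bit-depth resize path added above: highbd_resize_multistep repeatedly halves the working length with the symmetric even/odd down-2 filters and then closes any remaining gap with highbd_interpolate, while vp9_highbd_resize_plane applies that chain first across rows and then down columns via the fill_col/fill_arr helpers. (As written, the if (!tmpbuf) test always takes the malloc branch because tmpbuf starts out NULL, so the caller-supplied scratch buffer appears unused on this path.) A minimal sketch of the step-count logic the loop implies, with a hypothetical helper name, assuming get_down2_length(length, 1) is ceil(length / 2):

/* Illustrative only: how many halvings fit between in_length and out_length
 * before the residual ratio is handled by interpolation. */
static int count_down2_steps(int in_length, int out_length) {
  int steps = 0;
  while (((in_length + 1) >> 1) >= out_length) {  /* next halving still >= target */
    in_length = (in_length + 1) >> 1;             /* ceil(in_length / 2) */
    ++steps;
  }
  return steps;
}
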
diff --git a/libvpx/vp9/encoder/vp9_resize.h b/libvpx/vp9/encoder/vp9_resize.h
index 1818cd4..067af53 100644
--- a/libvpx/vp9/encoder/vp9_resize.h
+++ b/libvpx/vp9/encoder/vp9_resize.h
@@ -65,4 +65,60 @@
                          int oheight,
                          int owidth);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd);
+void vp9_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame422(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+void vp9_highbd_resize_frame444(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,
+                                int width,
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,
+                                int owidth,
+                                int bd);
+#endif    // CONFIG_VP9_HIGHBITDEPTH
 #endif    // VP9_ENCODER_VP9_RESIZE_H_
diff --git a/libvpx/vp9/encoder/vp9_sad.c b/libvpx/vp9/encoder/vp9_sad.c
deleted file mode 100644
index d062636..0000000
--- a/libvpx/vp9/encoder/vp9_sad.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-#include "vp9/encoder/vp9_variance.h"
-
-static INLINE unsigned int sad(const uint8_t *a, int a_stride,
-                               const uint8_t *b, int b_stride,
-                               int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  return sad;
-}
-
-#define sadMxN(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *ref, int ref_stride) { \
-  return sad(src, src_stride, ref, ref_stride, m, n); \
-} \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                      const uint8_t *ref, int ref_stride, \
-                                      const uint8_t *second_pred) { \
-  uint8_t comp_pred[m * n]; \
-  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
-  return sad(src, src_stride, comp_pred, m, m, n); \
-}
-
-#define sadMxNxK(m, n, k) \
-void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
-                                const uint8_t *ref, int ref_stride, \
-                                unsigned int *sads) { \
-  int i; \
-  for (i = 0; i < k; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
-}
-
-#define sadMxNx4D(m, n) \
-void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
-                             const uint8_t *const refs[], int ref_stride, \
-                             unsigned int *sads) { \
-  int i; \
-  for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
-}
-
-// 64x64
-sadMxN(64, 64)
-sadMxNxK(64, 64, 3)
-sadMxNxK(64, 64, 8)
-sadMxNx4D(64, 64)
-
-// 64x32
-sadMxN(64, 32)
-sadMxNx4D(64, 32)
-
-// 32x64
-sadMxN(32, 64)
-sadMxNx4D(32, 64)
-
-// 32x32
-sadMxN(32, 32)
-sadMxNxK(32, 32, 3)
-sadMxNxK(32, 32, 8)
-sadMxNx4D(32, 32)
-
-// 32x16
-sadMxN(32, 16)
-sadMxNx4D(32, 16)
-
-// 16x32
-sadMxN(16, 32)
-sadMxNx4D(16, 32)
-
-// 16x16
-sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
-sadMxNx4D(16, 16)
-
-// 16x8
-sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
-sadMxNx4D(16, 8)
-
-// 8x16
-sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
-sadMxNx4D(8, 16)
-
-// 8x8
-sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
-sadMxNx4D(8, 8)
-
-// 8x4
-sadMxN(8, 4)
-sadMxNxK(8, 4, 8)
-sadMxNx4D(8, 4)
-
-// 4x8
-sadMxN(4, 8)
-sadMxNxK(4, 8, 8)
-sadMxNx4D(4, 8)
-
-// 4x4
-sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
-sadMxNx4D(4, 4)
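
The deleted file above generated every plain-C SAD variant through the sadMxN/sadMxNxK/sadMxNx4D macros (its functionality is presumably provided elsewhere after this change). For orientation, the first function emitted by one instantiation, sadMxN(8, 8), expands to the shape below; this is just the preprocessor expansion, not new code:

unsigned int vp9_sad8x8_c(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride) {
  return sad(src, src_stride, ref, ref_stride, 8, 8);
}
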
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index d5676c3..c5c50a2 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -36,11 +36,7 @@
                           unsigned char abs_delta) {
   seg->abs_delta = abs_delta;
 
-  vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
-
-  // TBD ?? Set the feature mask
-  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
-  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
+  memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
 }
 void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
@@ -53,7 +49,7 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) {
+static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
   // Work out probabilities of each segment
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
@@ -70,7 +66,7 @@
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(int *segcounts, vp9_prob *probs) {
+static int cost_segmap(int *segcounts, vpx_prob *probs) {
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
   const int c45 = segcounts[4] + segcounts[5];
@@ -133,8 +129,8 @@
   if (cm->frame_type != KEY_FRAME) {
     const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
     // Test to see if the segment id matches the predicted value.
-    const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
-                                                   bsize, mi_row, mi_col);
+    const int pred_segment_id = get_segment_id(cm, cm->last_frame_seg_map,
+                                               bsize, mi_row, mi_col);
     const int pred_flag = pred_segment_id == segment_id;
     const int pred_context = vp9_get_pred_context_seg_id(xd);
 
@@ -211,14 +207,14 @@
   int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
   int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
 
-  vp9_prob no_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_nopred_prob[PREDICTION_PROBS];
+  vpx_prob no_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_nopred_prob[PREDICTION_PROBS];
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
-  vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
-  vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
+  memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+  memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
@@ -267,11 +263,11 @@
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     seg->temporal_update = 1;
-    vpx_memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+    memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
   } else {
     seg->temporal_update = 0;
-    vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+    memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
 
@@ -280,6 +276,6 @@
   seg->enabled = 0;
   seg->update_map = 0;
   seg->update_data = 0;
-  vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+  memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
   vp9_clearall_segfeatures(seg);
 }
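
A short aside on the probability-tree code touched above: calc_segtree_probs collapses the segment counts pairwise (c01, c23, c45, ...) and derives one binary probability per tree node, which cost_segmap then prices. A hedged sketch of the count-to-probability mapping such nodes conceptually use (helper name and clamping are illustrative, not taken from the patch):

/* Illustrative: map a pair of counts onto an 8-bit probability. */
static unsigned char counts_to_prob(int count0, int count1) {
  const int total = count0 + count1;
  int p;
  if (total == 0)
    return 128;  /* no statistics: assume an even split */
  p = (count0 * 256 + total / 2) / total;  /* round to nearest */
  if (p < 1) p = 1;    /* keep inside the legal 1..255 range */
  if (p > 255) p = 255;
  return (unsigned char)p;
}
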
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.c b/libvpx/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 0000000..aaa8ea0
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold = 1570636;                    // q18
+
+// Thresholds on luminance.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr) {
+  const int cb_q6 = cb << 6;
+  const int cr_q6 = cr << 6;
+  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+  const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+      skin_inv_cov[1] * cbcr_diff_q2 +
+      skin_inv_cov[2] * cbcr_diff_q2 +
+      skin_inv_cov[3] * cr_diff_q2;
+  return skin_diff;
+}
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+  if (y < y_low || y > y_high)
+    return 0;
+  else
+    return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mi_row, mi_col, num_bl;
+  VP9_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+  int y_bsize = 16;  // Use 8x8 or 16x16.
+  int uv_bsize = y_bsize >> 1;
+  int ypos = y_bsize >> 1;
+  int uvpos = uv_bsize >> 1;
+  int shy = (y_bsize == 8) ? 3 : 4;
+  int shuv = shy - 1;
+  int fac = y_bsize / 8;
+  // Use center pixel or average of center 2x2 pixels.
+  int mode_filter = 1;
+  YV12_BUFFER_CONFIG skinmap;
+  memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+  if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+      vpx_free_frame_buffer(&skinmap);
+      return;
+  }
+  memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+  y = skinmap.y_buffer;
+  // Loop through blocks and set skin map based on center pixel of block.
+  // Set y to white for skin block, otherwise set to source with gray scale.
+  // Ignore rightmost/bottom boundary blocks.
+  for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
+    num_bl = 0;
+    for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
+      // Select pixel for each block for skin detection.
+      // Use center pixel, or 2x2 average at center.
+      uint8_t ysource = src_y[ypos * src_ystride + ypos];
+      uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
+      uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
+      uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
+      uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
+      uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
+      uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
+      uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)];
+      uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)];
+      uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
+      uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
+      uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
+      if (mode_filter == 1) {
+        ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
+        usource = (usource + usource2 + usource3 + usource4) >> 2;
+        vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
+      }
+      const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      for (i = 0; i < y_bsize; i++) {
+        for (j = 0; j < y_bsize; j++) {
+          if (is_skin)
+            y[i * src_ystride + j] = 255;
+          else
+            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+        }
+      }
+      num_bl++;
+      y += y_bsize;
+      src_y += y_bsize;
+      src_u += uv_bsize;
+      src_v += uv_bsize;
+    }
+    y += (src_ystride << shy) - (num_bl << shy);
+    src_y += (src_ystride << shy) - (num_bl << shy);
+    src_u += (src_uvstride << shuv) - (num_bl << shuv);
+    src_v += (src_uvstride << shuv) - (num_bl << shuv);
+  }
+  vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+  vpx_free_frame_buffer(&skinmap);
+}
+#endif
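
The new skin classifier works entirely in fixed point: the chroma means are stored in q6, the inverse covariance in q16 and the decision threshold in q18, so evaluate_skin_color_difference never leaves integer arithmetic. A floating-point rendering of the same test, useful only for sanity-checking the scaling (illustrative, not part of the patch):

#include <stdint.h>

static int skin_pixel_float(uint8_t y, uint8_t cb, uint8_t cr) {
  const double mean_cb = 7463.0 / 64.0;           /* q6  -> ~116.6 */
  const double mean_cr = 9614.0 / 64.0;           /* q6  -> ~150.2 */
  const double icov[4] = { 4107.0 / 65536.0, 1663.0 / 65536.0,
                           1663.0 / 65536.0, 2157.0 / 65536.0 };  /* q16 */
  const double threshold = 1570636.0 / 262144.0;  /* q18 -> ~5.99 */
  const double dcb = cb - mean_cb;
  const double dcr = cr - mean_cr;
  const double d = icov[0] * dcb * dcb + (icov[1] + icov[2]) * dcb * dcr +
                   icov[3] * dcr * dcr;
  if (y < 20 || y > 220)   /* same luminance gate as vp9_skin_pixel() */
    return 0;
  return d < threshold;
}
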
diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 0000000..3d4e737
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_
+#define VP9_ENCODER_VP9_SKIN_MAP_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SKIN_MAP_H_
diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c
index 4fe3aac..5e72c4c 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/libvpx/vp9/encoder/vp9_speed_features.c
@@ -12,61 +12,119 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
 
-enum {
-  INTRA_ALL       = (1 << DC_PRED) |
-                    (1 << V_PRED) | (1 << H_PRED) |
-                    (1 << D45_PRED) | (1 << D135_PRED) |
-                    (1 << D117_PRED) | (1 << D153_PRED) |
-                    (1 << D207_PRED) | (1 << D63_PRED) |
-                    (1 << TM_PRED),
-  INTRA_DC        = (1 << DC_PRED),
-  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
-  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
-  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
-                    (1 << H_PRED)
-};
 
-enum {
-  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
-  INTER_NEAREST = (1 << NEARESTMV),
-  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
-};
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static int frame_is_boosted(const VP9_COMP *cpi) {
+  return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi);
+}
 
-enum {
-  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD) |
-                              (1 << THR_LAST),
+// Sets a partition size down to which the auto partition code will always
+// search (can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size, but the screen area taken up by a given
+// block size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
+  unsigned int screen_area = (cm->width * cm->height);
 
-  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+  // Select block size based on image format size.
+  if (screen_area < 1280 * 720) {
+    // Formats smaller in area than 720P
+    return BLOCK_4X4;
+  } else if (screen_area < 1920 * 1080) {
+    // Format >= 720P and < 1080P
+    return BLOCK_8X8;
+  } else {
+    // Formats 1080P and up
+    return BLOCK_16X16;
+  }
+}
 
-  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
+                                                       SPEED_FEATURES *sf,
+                                                       int speed) {
+  VP9_COMMON *const cm = &cpi->common;
 
-  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD)
-};
+  if (speed >= 1) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+      sf->partition_search_breakout_dist_thr = (1 << 21);
+    }
+  }
+
+  if (speed >= 2) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+      sf->adaptive_pred_interp_filter = 0;
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->partition_search_breakout_rate_thr = 120;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+      sf->partition_search_breakout_dist_thr = (1 << 22);
+      sf->partition_search_breakout_rate_thr = 100;
+    }
+    sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+  }
+
+  if (speed >= 3) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+      sf->partition_search_breakout_rate_thr = 200;
+    } else {
+      sf->max_intra_bsize = BLOCK_32X32;
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+      sf->partition_search_breakout_rate_thr = 120;
+    }
+  }
+
+  // If this is a two pass clip that fits the criteria for animated or
+  // graphics content then reset disable_split_mask for speeds 1-4.
+  // Also if the image edge is internal to the coded area.
+  if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp9_internal_image_edge(cpi)))) {
+    sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+  }
+
+  if (speed >= 4) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->partition_search_breakout_dist_thr = (1 << 26);
+    } else {
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+    }
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+  }
+}
 
 static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
                                    SPEED_FEATURES *sf, int speed) {
+  const int boosted = frame_is_boosted(cpi);
+
   sf->adaptive_rd_thresh = 1;
-  sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
   sf->allow_skip_recode = 1;
 
   if (speed >= 1) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
-    sf->less_rectangular_check  = 1;
-    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
-                                                      : USE_LARGESTALL;
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        vp9_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = !frame_is_boosted(cpi);
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
 
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
-                                              : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    sf->less_rectangular_check  = 1;
+
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
     sf->mv.auto_mv_step_size = 1;
@@ -80,71 +138,65 @@
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+    sf->tx_size_search_breakout = 1;
+    sf->partition_search_breakout_rate_thr = 80;
   }
 
   if (speed >= 2) {
-    if (MIN(cm->width, cm->height) >= 720) {
-      sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
-      sf->last_partitioning_redo_frequency = 3;
-      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
-                                              : DISABLE_ALL_INTER_SPLIT;
-      sf->adaptive_pred_interp_filter = 0;
-    } else {
-      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-      sf->last_partitioning_redo_frequency = 2;
-      sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
-    }
+    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+                                                      : USE_LARGESTALL;
 
-    sf->reference_masking = 1;
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+    // Reference masking is not supported in dynamic scaling mode.
+    sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
+
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
                                  FLAG_SKIP_INTRA_BESTINTER |
                                  FLAG_SKIP_COMP_BESTINTRA |
                                  FLAG_SKIP_INTRA_LOWVAR;
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->adjust_partitioning_from_last_frame = 1;
+    sf->allow_partition_search_skip = 1;
   }
 
   if (speed >= 3) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
                                                         : USE_LARGESTALL;
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->adaptive_pred_interp_filter = 0;
-    sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+    sf->adaptive_mode_search = 1;
+    sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
-    sf->motion_field_mode_search = frame_is_boosted(cpi) ? 0 : 1;
-
-    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
-    sf->last_partitioning_redo_frequency = 3;
+    sf->alt_ref_search_fp = 1;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+    sf->adaptive_interp_filter_search = 1;
   }
 
   if (speed >= 4) {
     sf->use_square_partition_only = 1;
     sf->tx_size_search_method = USE_LARGESTALL;
-    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->mv.search_method = BIGDIA;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
     sf->adaptive_rd_thresh = 4;
-    sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
-                                  FLAG_EARLY_TERMINATE;
+    if (cm->frame_type != KEY_FRAME)
+      sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
     sf->disable_filter_search_var_thresh = 200;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->use_lp32x32fdct = 1;
     sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
+    sf->motion_field_mode_search = !boosted;
+    sf->partition_search_breakout_rate_thr = 300;
   }
 
   if (speed >= 5) {
     int i;
-
-    sf->partition_search_type = FIXED_PARTITION;
     sf->optimize_coefficients = 0;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
@@ -152,18 +204,53 @@
       sf->intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-    cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-  }
-  if (speed >= 6) {
+    sf->partition_search_breakout_rate_thr = 500;
     sf->mv.reduce_first_step_size = 1;
+    sf->simple_model_rd_from_var = 1;
+  }
+}
+
+static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
+    SPEED_FEATURES *sf, int speed) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (speed >= 1) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    }
+  }
+
+  if (speed >= 2) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+    }
+  }
+
+  if (speed >= 5) {
+    if (MIN(cm->width, cm->height) >= 720) {
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+    } else {
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    }
+  }
+
+  if (speed >= 7) {
+    sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
+        800 : 300;
   }
 }
 
 static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  int speed, vp9e_tune_content content) {
   VP9_COMMON *const cm = &cpi->common;
-  const int frames_since_key =
-      cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
+  const int is_keyframe = cm->frame_type == KEY_FRAME;
+  const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
@@ -174,13 +261,8 @@
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
                                                         : USE_LARGESTALL;
 
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
-                                              : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
     sf->use_rd_breakout = 1;
+
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_interp_filter = 1;
     sf->mv.auto_mv_step_size = 1;
@@ -191,22 +273,23 @@
   }
 
   if (speed >= 2) {
-    if (MIN(cm->width, cm->height) >= 720)
-      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
-                                              : DISABLE_ALL_INTER_SPLIT;
-    else
-      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
                                  FLAG_SKIP_INTRA_BESTINTER |
                                  FLAG_SKIP_COMP_BESTINTRA |
                                  FLAG_SKIP_INTRA_LOWVAR;
     sf->adaptive_pred_interp_filter = 2;
-    sf->reference_masking = 1;
+
+    // Disable reference masking if using spatial scaling since
+    // pred_mv_sad will not be set (since vp9_mv_pred will not
+    // be called).
+    // TODO(marpan/agrange): Fix this condition.
+    sf->reference_masking = (cpi->oxcf.resize_mode != RESIZE_DYNAMIC &&
+                             cpi->svc.number_spatial_layers == 1) ? 1 : 0;
+
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -218,12 +301,9 @@
   if (speed >= 3) {
     sf->use_square_partition_only = 1;
     sf->disable_filter_search_var_thresh = 100;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->constrain_copy_partition = 1;
     sf->use_uv_intra_rd_estimate = 1;
     sf->skip_encode_sb = 1;
     sf->mv.subpel_iters_per_step = 1;
-    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
     sf->allow_skip_recode = 0;
@@ -249,6 +329,7 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->mv.search_method = FAST_HEX;
+
     sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
@@ -258,68 +339,107 @@
   }
 
   if (speed >= 5) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
-    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
-        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
-    sf->max_partition_size = BLOCK_32X32;
-    sf->min_partition_size = BLOCK_8X8;
-    sf->partition_check =
-        (frames_since_key % sf->last_partitioning_redo_frequency == 1);
-    sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
-        (frames_since_key %
-            (sf->last_partitioning_redo_frequency << 1) == 1);
-    sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+    sf->use_quant_fp = !is_keyframe;
+    sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+                                                  : STRICT_NEIGHBORING_MIN_MAX;
+    sf->default_max_partition_size = BLOCK_32X32;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->force_frame_boost = is_keyframe ||
+        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = is_keyframe ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
     sf->allow_skip_recode = 0;
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+    sf->adaptive_rd_thresh = 2;
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
+    sf->partition_search_breakout_rate_thr = 200;
+    sf->coeff_prob_appx_step = 4;
+    sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+    sf->simple_model_rd_from_var = 1;
+
+    if (!is_keyframe) {
+      int i;
+      if (content == VP9E_CONTENT_SCREEN) {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+      } else {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          if (i >= BLOCK_16X16)
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+          else
+            // Use H and V intra mode for block sizes <= 16X16.
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+      }
+    }
   }
 
   if (speed >= 6) {
-    if (content == VP9E_CONTENT_SCREEN) {
-      int i;
-      // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
-      for (i = 0; i < BLOCK_SIZES; ++i)
-        sf->inter_mode_mask[i] = INTER_ALL;
-    }
-
     // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
-    sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
-    sf->search_type_check_frequency = 50;
-
-    sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
-        USE_LARGESTALL : USE_TX_8X8;
-
-    // This feature is only enabled when partition search is disabled.
-    sf->reuse_inter_pred_sby = 1;
-
-    // Increase mode checking threshold for NEWMV.
-    sf->elevate_newmv_thresh = 2000;
-
+    sf->partition_search_type = VAR_BASED_PARTITION;
+    // Turn this on to use non-RD key frame coding mode.
+    sf->use_nonrd_pick_mode = 1;
+    sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
+    sf->skip_encode_sb = 0;
   }
+
   if (speed >= 7) {
+    sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
-    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
-    sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
-        800 : 300;
-    sf->elevate_newmv_thresh = 2500;
+    if (cpi->svc.number_temporal_layers > 2 &&
+        cpi->svc.temporal_layer_id == 0) {
+      sf->mv.search_method = NSTEP;
+      sf->mv.fullpel_search_step_param = 6;
+    }
   }
-  if (speed >= 12) {
-    sf->elevate_newmv_thresh = 4000;
+  if (speed >= 8) {
+    sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = 2;
-  }
-  if (speed >= 13) {
-    int i;
-    sf->max_intra_bsize = BLOCK_32X32;
-    for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->inter_mode_mask[i] = INTER_NEAREST;
+    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
   }
 }
 
-void vp9_set_speed_features(VP9_COMP *cpi) {
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RD_OPT *const rd = &cpi->rd;
+  int i;
+
+  if (oxcf->mode == REALTIME) {
+    set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+  } else if (oxcf->mode == GOOD) {
+    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+  }
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+    sf->adaptive_pred_interp_filter = 0;
+  }
+
+  if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+      sf->encode_breakout_thresh > cpi->encode_breakout) {
+    cpi->encode_breakout = sf->encode_breakout_thresh;
+  }
+
+  // Check for masked out split cases.
+  for (i = 0; i < MAX_REFS; ++i) {
+    if (sf->disable_split_mask & (1 << i)) {
+      rd->thresh_mult_sub8x8[i] = INT_MAX;
+    }
+  }
+}
+
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
 
@@ -332,35 +452,39 @@
   sf->mv.subpel_force_stop = 0;
   sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
   sf->mv.reduce_first_step_size = 0;
+  sf->coeff_prob_appx_step = 1;
   sf->mv.auto_mv_step_size = 0;
   sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
+  sf->adaptive_mode_search = 0;
   sf->cb_pred_filter_search = 0;
   sf->cb_partition_search = 0;
   sf->motion_field_mode_search = 0;
+  sf->alt_ref_search_fp = 0;
   sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
-  sf->max_partition_size = BLOCK_64X64;
-  sf->min_partition_size = BLOCK_4X4;
+  sf->rd_auto_partition_min_limit = BLOCK_4X4;
+  sf->default_max_partition_size = BLOCK_64X64;
+  sf->default_min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
   sf->last_partitioning_redo_frequency = 4;
-  sf->constrain_copy_partition = 0;
   sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
   sf->force_frame_boost = 0;
   sf->max_delta_qindex = 0;
-  sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
+  sf->adaptive_interp_filter_search = 0;
+  sf->allow_partition_search_skip = 0;
+
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
     sf->intra_uv_mode_mask[i] = INTRA_ALL;
@@ -373,6 +497,7 @@
   sf->use_fast_coef_updates = TWO_LOOP;
   sf->use_fast_coef_costing = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->schedule_mode_search = 0;
   sf->use_nonrd_pick_mode = 0;
   for (i = 0; i < BLOCK_SIZES; ++i)
     sf->inter_mode_mask[i] = INTER_ALL;
@@ -383,25 +508,22 @@
   sf->always_this_block_size = BLOCK_16X16;
   sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
-  sf->elevate_newmv_thresh = 0;
-  // Recode loop tolerence %.
+  // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
+  sf->tx_size_search_breakout = 0;
+  sf->partition_search_breakout_dist_thr = 0;
+  sf->partition_search_breakout_rate_thr = 0;
+  sf->simple_model_rd_from_var = 0;
 
-  switch (oxcf->mode) {
-    case ONE_PASS_BEST:
-    case TWO_PASS_SECOND_BEST:  // This is the best quality mode.
-      cpi->diamond_search_sad = vp9_full_range_search;
-      break;
-    case TWO_PASS_FIRST:
-    case ONE_PASS_GOOD:
-    case TWO_PASS_SECOND_GOOD:
-      set_good_speed_feature(cpi, cm, sf, oxcf->speed);
-      break;
-    case REALTIME:
-      set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
-      break;
-  }
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+  cpi->full_search_sad = vp9_full_search_sad;
+  cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
+                                               : vp9_diamond_search_sad;
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -416,18 +538,20 @@
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
-  cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+  x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
 
-  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
-    sf->adaptive_pred_interp_filter = 0;
+  x->min_partition_size = sf->default_min_partition_size;
+  x->max_partition_size = sf->default_max_partition_size;
 
   if (!cpi->oxcf.frame_periodic_boost) {
     sf->max_delta_qindex = 0;
   }
-
-  if (cpi->encode_breakout && oxcf->mode == REALTIME &&
-      sf->encode_breakout_thresh > cpi->encode_breakout)
-    cpi->encode_breakout = sf->encode_breakout_thresh;
 }
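
The restructuring above splits speed-feature setup into a framesize-independent pass (defaults plus the speed/mode dependent settings) and a framesize-dependent pass (split masks, breakout thresholds and sub-8x8 threshold masking that vary with the coded width/height). A hedged sketch of the call order an encoder would be expected to use once a frame's size is known; the wrapper name is illustrative:

/* Illustrative only: establish defaults first, then refine the
 * size-dependent thresholds, e.g. after dynamic resizing. */
static void setup_speed_features_for_frame(struct VP9_COMP *cpi) {
  vp9_set_speed_features_framesize_independent(cpi);
  vp9_set_speed_features_framesize_dependent(cpi);
}
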
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h
index 243139d..95038ce 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/libvpx/vp9/encoder/vp9_speed_features.h
@@ -17,6 +17,47 @@
 extern "C" {
 #endif
 
+enum {
+  INTRA_ALL       = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
+
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -40,6 +81,9 @@
 
 typedef enum {
   SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
+  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
@@ -49,12 +93,6 @@
 } MOTION_THRESHOLD;
 
 typedef enum {
-  LAST_FRAME_PARTITION_OFF = 0,
-  LAST_FRAME_PARTITION_LOW_MOTION = 1,
-  LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
   USE_FULL_RD = 0,
   USE_LARGESTALL,
   USE_TX_8X8
@@ -85,11 +123,6 @@
   // Skips comp inter modes if the best so far is an intra mode.
   FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
 
-  // Skips comp inter modes if the best single intermode so far does
-  // not have the same reference as one of the two references being
-  // tested.
-  FLAG_SKIP_COMP_REFMISMATCH = 1 << 2,
-
   // Skips oblique intra modes if the best so far is an inter mode.
   FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
 
@@ -102,17 +135,19 @@
 } MODE_SEARCH_SKIP_LOGIC;
 
 typedef enum {
+  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+  FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
   // Search partitions using RD/NONRD criterion
-  SEARCH_PARTITION = 0,
+  SEARCH_PARTITION,
 
   // Always use a fixed size partition
-  FIXED_PARTITION = 1,
+  FIXED_PARTITION,
 
-  // Use a fixed size partition in every 64X64 SB, where the size is
-  // determined based on source variance
-  VAR_BASED_FIXED_PARTITION = 2,
-
-  REFERENCE_PARTITION = 3,
+  REFERENCE_PARTITION,
 
   // Use an arbitrary partitioning scheme based on source variance within
   // a 64X64 SB
@@ -127,12 +162,9 @@
   // before the final run.
   TWO_LOOP = 0,
 
-  // No dry run conducted.
-  ONE_LOOP = 1,
-
   // No dry run, also only half the coef contexts and bands are updated.
   // The rest are not updated at all.
-  ONE_LOOP_REDUCED = 2
+  ONE_LOOP_REDUCED = 1
 } FAST_COEFF_UPDATE;
 
 typedef struct MV_SPEED_FEATURES {
@@ -200,14 +232,8 @@
   // level within a frame.
   int allow_skip_recode;
 
-  // This variable allows us to reuse the last frames partition choices
-  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
-  // frame as a starting point in low motion scenes or always use it. If set
-  // we use last partitioning_redo frequency to determine how often to redo
-  // the partitioning from scratch. Adjust_partitioning_from_last_frame
-  // enables us to adjust up or down one partitioning from the last frames
-  // partitioning.
-  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+  // Coefficient probability model approximation step size
+  int coeff_prob_appx_step;
 
   // The threshold is to determine how slow the motion is; it is used when
   // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
@@ -222,8 +248,6 @@
   // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
 
-  // TODO(JBB): remove this as its no longer used.
-
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -247,11 +271,14 @@
   // Sets min and max partition sizes for this 64x64 region based on the
   // same 64x64 in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+  // Ensures the rd based auto partition search will always
+  // go down at least to the specified level.
+  BLOCK_SIZE rd_auto_partition_min_limit;
 
   // Min and max partition size we enable (block_size) as per auto
   // min max, but also used by adjust partitioning, and pick_partitioning.
-  BLOCK_SIZE min_partition_size;
-  BLOCK_SIZE max_partition_size;
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
 
   // Whether or not we allow partitions one smaller or one greater than the last
   // frame's partitioning. Only used if use_lastframe_partitioning is set.
@@ -261,12 +288,6 @@
   // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
 
-  // This enables constrained copy partitioning, which, given an input block
-  // size bsize, will copy previous partition for partitions less than bsize,
-  // otherwise bsize partition is used. bsize is currently set to 16x16.
-  // Used for the case where motion is detected in superblock.
-  int constrain_copy_partition;
-
   // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
   // it always, to allow it for only Last frame and Intra, disable it for all
   // inter modes or to enable it always.
@@ -277,12 +298,17 @@
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
+  int schedule_mode_search;
+
   // Allows sub 8x8 modes to use the prediction filter that was determined
   // best for 8x8 mode. If set to 0 we always recheck all the filters for
   // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
   // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_interp_filter;
 
+  // Adaptive prediction mode search
+  int adaptive_mode_search;
+
   // Chessboard pattern prediction filter type search
   int cb_pred_filter_search;
 
@@ -290,13 +316,11 @@
 
   int motion_field_mode_search;
 
+  int alt_ref_search_fp;
+
   // Fast quantization process path
   int use_quant_fp;
 
-  // Search through variable block partition types in non-RD mode decision
-  // encoding process for RTC.
-  int partition_check;
-
   // Use finer quantizer in every other few frames that run variable block
   // partition type search.
   int force_frame_boost;
@@ -309,9 +333,6 @@
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
 
-  // A source variance threshold below which the split mode is disabled
-  unsigned int disable_split_var_thresh;
-
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
@@ -321,6 +342,10 @@
   int intra_y_mode_mask[TX_SIZES];
   int intra_uv_mode_mask[TX_SIZES];
 
+  // These bit masks allow you to enable or disable intra modes for each
+  // prediction block size separately.
+  int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
   // This variable enables an early break out of mode testing if the model for
   // rd built from the prediction signal indicates a value that's much
   // higher than the best rd we've seen so far.
@@ -371,20 +396,37 @@
   // enabled in real time mode.
   int encode_breakout_thresh;
 
-  // In real time encoding, increase the threshold for NEWMV.
-  int elevate_newmv_thresh;
-
   // default interp filter choice
   INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies while
+  // tx_size_search_method is USE_FULL_RD.
+  int tx_size_search_breakout;
+
+  // adaptive interp_filter search to allow skip of certain filter types.
+  int adaptive_interp_filter_search;
+
+  // mask for skip evaluation of certain interp_filter type.
+  INTERP_FILTER_MASK interp_filter_search_mask;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // Fast approximation of vp9_model_rd_from_var_lapndz
+  int simple_model_rd_from_var;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
 
-void vp9_set_speed_features(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
 #endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
-
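
The INTRA_*, INTER_* and *_SPLIT values moved into this header are plain bitmasks indexed by prediction mode (or by THR_* reference combination for the split masks). A small illustration of how such a mask gates a mode loop; the helper is hypothetical:

/* Illustrative: a mode survives only if its bit is set in the mask. */
static int mode_is_allowed(int mode_mask, PREDICTION_MODE mode) {
  return (mode_mask & (1 << mode)) != 0;
}

/* e.g. with INTRA_DC_H_V, only DC_PRED, V_PRED and H_PRED pass:
 *   mode_is_allowed(INTRA_DC_H_V, DC_PRED) -> 1
 *   mode_is_allowed(INTRA_DC_H_V, TM_PRED) -> 0 */
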
diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c
deleted file mode 100644
index 026e6a8..0000000
--- a/libvpx/vp9/encoder/vp9_ssim.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-
-#include "vp9/encoder/vp9_ssim.h"
-
-void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
-                            int rp, unsigned long *sum_s, unsigned long *sum_r,
-                            unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                            unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
-                          unsigned long *sum_s, unsigned long *sum_r,
-                          unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                          unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-static const int64_t cc1 =  26634;  // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
-
-static double similarity(unsigned long sum_s, unsigned long sum_r,
-                         unsigned long sum_sq_s, unsigned long sum_sq_r,
-                         unsigned long sum_sxr, int count) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
-
-  // scale the constants by number of pixels
-  c1 = (cc1 * count * count) >> 12;
-  c2 = (cc2 * count * count) >> 12;
-
-  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
-                                       (int64_t) 2 * sum_s * sum_r + c2);
-
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
-
-  return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                     &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
-                 int stride_img2, int width, int height) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight) {
-  double a, b, c;
-  double ssimv;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride,
-                source->y_crop_width, source->y_crop_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-
-  ssimv = a * .8 + .1 * (b + c);
-
-  *weight = 1;
-
-  return ssimv;
-}
-
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v) {
-  double ssim_all = 0;
-  double a, b, c;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride,
-                source->y_crop_width, source->y_crop_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-  *ssim_y = a;
-  *ssim_u = b;
-  *ssim_v = c;
-  ssim_all = (a * 4 + b + c) / 6;
-
-  return ssim_all;
-}
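
For reference, the similarity() routine removed above is an integer form of the
standard per-window SSIM. Writing the window sums as \Sigma_s, \Sigma_r,
\Sigma_{ss}, \Sigma_{rr}, \Sigma_{sr} and the pixel count as n, it evaluates

  \mathrm{ssim} =
    \frac{(2\Sigma_s\Sigma_r + c_1)\,(2n\Sigma_{sr} - 2\Sigma_s\Sigma_r + c_2)}
         {(\Sigma_s^2 + \Sigma_r^2 + c_1)\,
          (n\Sigma_{ss} - \Sigma_s^2 + n\Sigma_{rr} - \Sigma_r^2 + c_2)},
  \qquad c_1 \approx n^2 (0.01 \cdot 255)^2,\quad c_2 \approx n^2 (0.03 \cdot 255)^2,

which is where cc1 = 26634 ~= 64^2*(0.01*255)^2, cc2 = 239708 ~= 64^2*(0.03*255)^2
and the ">> 12" (divide by 4096 = 64^2) scaling come from. Dividing the numerator
and denominator by n^4 recovers the familiar mean/variance form
(2*mu_s*mu_r + C1)(2*sigma_sr + C2) / ((mu_s^2 + mu_r^2 + C1)(sigma_s^2 + sigma_r^2 + C2)).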
diff --git a/libvpx/vp9/encoder/vp9_ssim.h b/libvpx/vp9/encoder/vp9_ssim.h
deleted file mode 100644
index a581c2c..0000000
--- a/libvpx/vp9/encoder/vp9_ssim.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_SSIM_H_
-#define VP9_ENCODER_VP9_SSIM_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "vpx_scale/yv12config.h"
-
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight);
-
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_SSIM_H_
diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c
index 530b592..799f179 100644
--- a/libvpx/vp9/encoder/vp9_subexp.c
+++ b/libvpx/vp9/encoder/vp9_subexp.c
@@ -7,12 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vpx_dsp/bitwriter.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
-
 #include "vp9/encoder/vp9_cost.h"
-#include "vp9/encoder/vp9_writer.h"
+#include "vp9/encoder/vp9_subexp.h"
 
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
@@ -78,50 +78,50 @@
   return i;
 }
 
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
   int delp = remap_prob(newp, oldp);
   return update_bits[delp] * 256;
 }
 
-static void encode_uniform(vp9_writer *w, int v) {
+static void encode_uniform(vpx_writer *w, int v) {
   const int l = 8;
   const int m = (1 << l) - 191;
   if (v < m) {
-    vp9_write_literal(w, v, l - 1);
+    vpx_write_literal(w, v, l - 1);
   } else {
-    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vp9_write_literal(w, (v - m) & 1, 1);
+    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vpx_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-static INLINE int write_bit_gte(vp9_writer *w, int word, int test) {
-  vp9_write_literal(w, word >= test, 1);
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+  vpx_write_literal(w, word >= test, 1);
   return word >= test;
 }
 
-static void encode_term_subexp(vp9_writer *w, int word) {
+static void encode_term_subexp(vpx_writer *w, int word) {
   if (!write_bit_gte(w, word, 16)) {
-    vp9_write_literal(w, word, 4);
+    vpx_write_literal(w, word, 4);
   } else if (!write_bit_gte(w, word, 32)) {
-    vp9_write_literal(w, word - 16, 4);
+    vpx_write_literal(w, word - 16, 4);
   } else if (!write_bit_gte(w, word, 64)) {
-    vp9_write_literal(w, word - 32, 5);
+    vpx_write_literal(w, word - 32, 5);
   } else {
     encode_uniform(w, word - 64);
   }
 }
 
-void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
+void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
   const int delp = remap_prob(newp, oldp);
   encode_term_subexp(w, delp);
 }
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd) {
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd) {
   const int old_b = cost_branch256(ct, oldp);
   int bestsavings = 0;
-  vp9_prob newp, bestnewp = oldp;
+  vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
 
   for (newp = *bestp; newp != oldp; newp += step) {
@@ -138,14 +138,15 @@
 }
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd) {
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
-  vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
   vp9_model_to_full_probs(oldp, oldplist);
-  vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
     old_b += cost_branch256(ct + 2 * i, oldplist[i]);
   old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
@@ -153,40 +154,60 @@
   bestsavings = 0;
   bestnewp = oldp[PIVOT_NODE];
 
-  step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
-
-  for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) {
-    if (newp < 1 || newp > 255)
-      continue;
-    newplist[PIVOT_NODE] = newp;
-    vp9_model_to_full_probs(newplist, newplist);
-    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
-      new_b += cost_branch256(ct + 2 * i, newplist[i]);
-    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
-    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
-        vp9_cost_upd256;
-    savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (*bestp > oldp[PIVOT_NODE]) {
+    step = -stepsize;
+    for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
+      if (newp < 1 || newp > 255)
+        continue;
+      newplist[PIVOT_NODE] = newp;
+      vp9_model_to_full_probs(newplist, newplist);
+      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+        new_b += cost_branch256(ct + 2 * i, newplist[i]);
+      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+          vp9_cost_upd256;
+      savings = old_b - new_b - update_b;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
+    }
+  } else {
+    step = stepsize;
+    for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
+      if (newp < 1 || newp > 255)
+        continue;
+      newplist[PIVOT_NODE] = newp;
+      vp9_model_to_full_probs(newplist, newplist);
+      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+        new_b += cost_branch256(ct + 2 * i, newplist[i]);
+      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+          vp9_cost_upd256;
+      savings = old_b - new_b - update_b;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
+
   *bestp = bestnewp;
   return bestsavings;
 }
 
-void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
-  const vp9_prob upd = DIFF_UPDATE_PROB;
-  vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
                                                           upd);
   assert(newp >= 1);
   if (savings > 0) {
-    vp9_write(w, 1, upd);
+    vpx_write(w, 1, upd);
     vp9_write_prob_diff_update(w, newp, *oldp);
     *oldp = newp;
   } else {
-    vp9_write(w, 0, upd);
+    vpx_write(w, 0, upd);
   }
 }
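
As a rough guide to what the subexponential code above costs, the sketch below
counts the literal bits written by encode_term_subexp()/encode_uniform() for a
given probability delta, assuming each vpx_write_literal()/write_bit_gte() bit
costs exactly one output bit. It mirrors the bucket boundaries at 16, 32 and 64
and the m = 2^8 - 191 = 65 split inside encode_uniform():

  #include <stdio.h>

  /* Bit-count model of encode_uniform() above: values below m take l-1 bits,
   * the rest take l bits. */
  static int uniform_bits(int v) {
    const int l = 8;
    const int m = (1 << l) - 191;   /* 65 */
    return v < m ? l - 1 : l;
  }

  /* Bit-count model of encode_term_subexp(): one flag bit per comparison
   * against 16/32/64, then a literal of the bucket's width. */
  static int term_subexp_bits(int word) {
    if (word < 16) return 1 + 4;
    if (word < 32) return 2 + 4;
    if (word < 64) return 3 + 5;
    return 3 + uniform_bits(word - 64);
  }

  int main(void) {
    int d;
    for (d = 0; d < 256; d += 37)
      printf("delta %3d -> %2d bits\n", d, term_subexp_bits(d));
    return 0;
  }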
diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h
index 8e02a1d..b968232 100644
--- a/libvpx/vp9/encoder/vp9_subexp.h
+++ b/libvpx/vp9/encoder/vp9_subexp.h
@@ -16,21 +16,26 @@
 extern "C" {
 #endif
 
-void vp9_write_prob_diff_update(vp9_writer *w,
-                                vp9_prob newp, vp9_prob oldp);
+#include "vpx_dsp/prob.h"
 
-void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               unsigned int *ct);
+struct vpx_writer;
+
+void vp9_write_prob_diff_update(struct vpx_writer *w,
+                                vpx_prob newp, vpx_prob oldp);
+
+void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+                               const unsigned int ct[2]);
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd);
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd);
 
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd);
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index bf949c4..e69404a 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -14,68 +14,91 @@
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_extend.h"
 
+#define SMALL_FRAME_FB_IDX 7
+#define SMALL_FRAME_WIDTH  32
+#define SMALL_FRAME_HEIGHT 16
+
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  int layer;
-  int layer_end;
+  int sl, tl;
   int alt_ref_idx = svc->number_spatial_layers;
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
 
-  if (svc->number_temporal_layers > 1) {
-    layer_end = svc->number_temporal_layers;
-  } else {
-    layer_end = svc->number_spatial_layers;
+  if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
+    if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
+                                 SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
+                                 cpi->common.subsampling_x,
+                                 cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cpi->common.use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate empty frame for multiple frame "
+                         "contexts");
+
+    memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
+           cpi->svc.empty_frame.img.buffer_alloc_sz);
   }
 
-  for (layer = 0; layer < layer_end; ++layer) {
-    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-    RATE_CONTROL *const lrc = &lc->rc;
-    int i;
-    lc->current_video_frame_in_layer = 0;
-    lc->layer_size = 0;
-    lrc->ni_av_qi = oxcf->worst_allowed_q;
-    lrc->total_actual_bits = 0;
-    lrc->total_target_vs_actual = 0;
-    lrc->ni_tot_qi = 0;
-    lrc->tot_q = 0.0;
-    lrc->avg_q = 0.0;
-    lrc->ni_frames = 0;
-    lrc->decimation_count = 0;
-    lrc->decimation_factor = 0;
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      int i;
+      lc->current_video_frame_in_layer = 0;
+      lc->layer_size = 0;
+      lc->frames_from_key_frame = 0;
+      lc->last_frame_type = FRAME_TYPES;
+      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->total_actual_bits = 0;
+      lrc->total_target_vs_actual = 0;
+      lrc->ni_tot_qi = 0;
+      lrc->tot_q = 0.0;
+      lrc->avg_q = 0.0;
+      lrc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
 
-    for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-      lrc->rate_correction_factors[i] = 1.0;
-    }
+      for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+        lrc->rate_correction_factors[i] = 1.0;
+      }
 
-    if (svc->number_temporal_layers > 1) {
-      lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
-      lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
-      lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
-    } else {
-      lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
-      lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
-      lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
-      lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
-                                          oxcf->best_allowed_q) / 2;
-      lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+      if (cpi->oxcf.rc_mode == VPX_CBR) {
+        lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+        lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+        lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+        lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+      } else {
+        lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+        lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+        lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
+        lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
                                             oxcf->best_allowed_q) / 2;
-      if (oxcf->ss_play_alternate[layer])
-        lc->alt_ref_idx = alt_ref_idx++;
-      else
-        lc->alt_ref_idx = -1;
-      lc->gold_ref_idx = -1;
-    }
+        lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+                                              oxcf->best_allowed_q) / 2;
+        if (oxcf->ss_enable_auto_arf[sl])
+          lc->alt_ref_idx = alt_ref_idx++;
+        else
+          lc->alt_ref_idx = INVALID_IDX;
+        lc->gold_ref_idx = INVALID_IDX;
+      }
 
-    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
-                                    lc->target_bandwidth, 1000);
-    lrc->bits_off_target = lrc->buffer_level;
+      lrc->buffer_level = oxcf->starting_buffer_level_ms *
+                              lc->target_bandwidth / 1000;
+      lrc->bits_off_target = lrc->buffer_level;
+    }
   }
 
   // Still have extra buffer for base layer golden frame
-  if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES)
+  if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
+      && alt_ref_idx < REF_FRAMES)
     svc->layer_context[0].gold_ref_idx = alt_ref_idx;
 }
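
The loops above index layer_context[] with LAYER_IDS_TO_IDX(sl, tl,
ts_number_layers). Consistent with the explicit sl * number_temporal_layers + tl
arithmetic used elsewhere in this file, the sketch below shows that mapping; the
macro body here is an assumption written out for illustration:

  #include <stdio.h>

  /* Assumed to match LAYER_IDS_TO_IDX: temporal layers are stored
   * contiguously within each spatial layer. */
  #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))

  int main(void) {
    const int num_spatial = 3, num_temporal = 2;
    int sl, tl;
    for (sl = 0; sl < num_spatial; ++sl)
      for (tl = 0; tl < num_temporal; ++tl)
        printf("spatial %d, temporal %d -> layer_context[%d]\n",
               sl, tl, LAYER_IDS_TO_IDX(sl, tl, num_temporal));
    return 0;
  }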
 
@@ -85,72 +108,121 @@
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int layer;
-  int layer_end;
+  int sl, tl, layer = 0, spatial_layer_target;
   float bitrate_alloc = 1.0;
 
-  if (svc->number_temporal_layers > 1) {
-    layer_end = svc->number_temporal_layers;
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+    for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+      spatial_layer_target = 0;
+
+      for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+        layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+        svc->layer_context[layer].target_bandwidth =
+            oxcf->layer_target_bitrate[layer];
+      }
+
+      layer = LAYER_IDS_TO_IDX(sl, ((oxcf->ts_number_layers - 1) < 0 ?
+          0 : (oxcf->ts_number_layers - 1)), oxcf->ts_number_layers);
+      spatial_layer_target =
+          svc->layer_context[layer].target_bandwidth =
+              oxcf->layer_target_bitrate[layer];
+
+      for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+        LAYER_CONTEXT *const lc =
+            &svc->layer_context[sl * oxcf->ts_number_layers + tl];
+        RATE_CONTROL *const lrc = &lc->rc;
+
+        lc->spatial_layer_target_bandwidth = spatial_layer_target;
+        bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target;
+        lrc->starting_buffer_level =
+            (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+        lrc->optimal_buffer_level =
+            (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+        lrc->maximum_buffer_size =
+            (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+        lrc->bits_off_target =
+            MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+        lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
+        lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
+        lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+        lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+        lrc->worst_quality = rc->worst_quality;
+        lrc->best_quality = rc->best_quality;
+      }
+    }
   } else {
-    layer_end = svc->number_spatial_layers;
-  }
+    int layer_end;
 
-  for (layer = 0; layer < layer_end; ++layer) {
-    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-    RATE_CONTROL *const lrc = &lc->rc;
+    if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+      layer_end = svc->number_temporal_layers;
+    } else {
+      layer_end = svc->number_spatial_layers;
+    }
 
-    if (svc->number_temporal_layers > 1) {
-      lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
-    } else {
-      lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
+    for (layer = 0; layer < layer_end; ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+
+      lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+
+      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      // Update buffer-related quantities.
+      lrc->starting_buffer_level =
+          (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+      lrc->optimal_buffer_level =
+          (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+      lrc->maximum_buffer_size =
+          (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+      lrc->bits_off_target = MIN(lrc->bits_off_target,
+                                 lrc->maximum_buffer_size);
+      lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
+      // Update framerate-related quantities.
+      if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+        lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
+      } else {
+        lc->framerate = cpi->framerate;
+      }
+      lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+      lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+      // Update qp-related quantities.
+      lrc->worst_quality = rc->worst_quality;
+      lrc->best_quality = rc->best_quality;
     }
-    bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
-    // Update buffer-related quantities.
-    lrc->starting_buffer_level =
-        (int64_t)(rc->starting_buffer_level * bitrate_alloc);
-    lrc->optimal_buffer_level =
-        (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
-    lrc->maximum_buffer_size =
-        (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-    lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
-    // Update framerate-related quantities.
-    if (svc->number_temporal_layers > 1) {
-      lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
-    } else {
-      lc->framerate = oxcf->framerate;
-    }
-    lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
-    lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
-    // Update qp-related quantities.
-    lrc->worst_quality = rc->worst_quality;
-    lrc->best_quality = rc->best_quality;
   }
 }
 
-static LAYER_CONTEXT *get_layer_context(SVC *svc) {
-  return svc->number_temporal_layers > 1 ?
-         &svc->layer_context[svc->temporal_layer_id] :
-         &svc->layer_context[svc->spatial_layer_id];
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+  if (is_one_pass_cbr_svc(cpi))
+    return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+        cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id];
+  else
+    return (cpi->svc.number_temporal_layers > 1 &&
+            cpi->oxcf.rc_mode == VPX_CBR) ?
+             &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+             &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
 }
 
 void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   RATE_CONTROL *const lrc = &lc->rc;
-  const int layer = svc->temporal_layer_id;
+  // Index into spatial+temporal arrays.
+  const int st_idx = svc->spatial_layer_id * svc->number_temporal_layers +
+      svc->temporal_layer_id;
+  const int tl = svc->temporal_layer_id;
 
-  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+  lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
   lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
-  if (layer == 0) {
+  if (tl == 0) {
     lc->avg_frame_size = lrc->avg_frame_bandwidth;
   } else {
     const double prev_layer_framerate =
-        oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
-    const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1];
+        cpi->framerate / oxcf->ts_rate_decimator[tl - 1];
+    const int prev_layer_target_bandwidth =
+        oxcf->layer_target_bitrate[st_idx - 1];
     lc->avg_frame_size =
         (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
               (lc->framerate - prev_layer_framerate));
@@ -159,7 +231,7 @@
 
 void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   RATE_CONTROL *const lrc = &lc->rc;
 
   lc->framerate = framerate;
@@ -168,11 +240,11 @@
                                    oxcf->two_pass_vbrmin_section / 100);
   lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
                                    oxcf->two_pass_vbrmax_section) / 100);
-  vp9_rc_set_gf_max_interval(cpi, lrc);
+  vp9_rc_set_gf_interval_range(cpi, lrc);
 }
 
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
 
@@ -190,7 +262,7 @@
 
 void vp9_save_layer_context(VP9_COMP *const cpi) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
 
   lc->rc = cpi->rc;
   lc->twopass = cpi->twopass;
@@ -214,71 +286,258 @@
   svc->spatial_layer_id = 0;
 }
 
-void vp9_inc_frame_in_layer(SVC *svc) {
-  LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
-      ? &svc->layer_context[svc->temporal_layer_id]
-      : &svc->layer_context[svc->spatial_layer_id];
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+  LAYER_CONTEXT *const lc =
+      &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                              cpi->svc.number_temporal_layers];
   ++lc->current_video_frame_in_layer;
+  ++lc->frames_from_key_frame;
 }
 
 int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
-  return is_spatial_svc(cpi) &&
+  return is_two_pass_svc(cpi) &&
          cpi->svc.spatial_layer_id > 0 &&
-         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                                cpi->svc.number_temporal_layers +
+                                cpi->svc.temporal_layer_id].is_key_frame;
 }
 
-#if CONFIG_SPATIAL_SVC
-int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx,
-                           YV12_BUFFER_CONFIG *src, int64_t ts_start,
-                           int64_t ts_end, unsigned int flags) {
-  struct lookahead_entry *buf;
-  int i, index;
+static void get_layer_resolution(const int width_org, const int height_org,
+                                 const int num, const int den,
+                                 int *width_out, int *height_out) {
+  int w, h;
 
-  if (vp9_lookahead_push(ctx, src, ts_start, ts_end, flags))
-    return 1;
+  if (width_out == NULL || height_out == NULL || den == 0)
+    return;
 
-  index = ctx->write_idx - 1;
-  if (index < 0)
-    index += ctx->max_sz;
+  w = width_org * num / den;
+  h = height_org * num / den;
 
-  buf = ctx->buf + index;
+  // Make height and width even to keep the Chrome player happy.
+  w += w % 2;
+  h += h % 2;
 
-  if (buf == NULL)
-    return 1;
+  *width_out = w;
+  *height_out = h;
+}
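
For example, with width_org x height_org = 1280x720 and scaling_factor_num/den =
1/2, get_layer_resolution() above returns 640x360; with 853x480 at 1/1 the odd
width 853 is bumped to 854 by w += w % 2, i.e. odd dimensions are rounded up to
the next even value.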
 
-  // Store svc parameters for each layer
-  for (i = 0; i < cpi->svc.number_spatial_layers; ++i)
-    buf->svc_params[i] = cpi->svc.layer_context[i].svc_params_received;
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 3, which uses the 0-2-1-2 temporal
+// layering scheme.
+static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
+  int frame_num_within_temporal_struct = 0;
+  int spatial_id, temporal_id;
+  spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+  frame_num_within_temporal_struct =
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+      cpi->svc.number_temporal_layers].current_video_frame_in_layer % 4;
+  temporal_id = cpi->svc.temporal_layer_id =
+      (frame_num_within_temporal_struct & 1) ? 2 :
+      (frame_num_within_temporal_struct >> 1);
+  cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+      cpi->ext_refresh_alt_ref_frame = 0;
+  if (!temporal_id) {
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_last_frame = 1;
+    if (!spatial_id) {
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+      // base layer is a key frame.
+      cpi->ref_frame_flags = VP9_GOLD_FLAG;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  } else if (temporal_id == 1) {
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_alt_ref_frame = 1;
+    if (!spatial_id) {
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  } else {
+    if (frame_num_within_temporal_struct == 1) {
+      // the first tl2 picture
+      if (!spatial_id) {
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ext_refresh_alt_ref_frame = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+      } else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ext_refresh_alt_ref_frame = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      } else {  // Top layer
+        cpi->ext_refresh_frame_flags_pending = 0;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      }
+    } else {
+      // The second tl2 picture
+      if (!spatial_id) {
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+        cpi->ext_refresh_last_frame = 1;
+      } else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+        cpi->ext_refresh_last_frame = 1;
+      } else {  // top layer
+        cpi->ext_refresh_frame_flags_pending = 0;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      }
+    }
+  }
+  if (temporal_id == 0) {
+    cpi->lst_fb_idx = spatial_id;
+    if (spatial_id)
+      cpi->gld_fb_idx = spatial_id - 1;
+    else
+      cpi->gld_fb_idx = 0;
+    cpi->alt_fb_idx = 0;
+  } else if (temporal_id == 1) {
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  } else if (frame_num_within_temporal_struct == 1) {
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  } else {
+    cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = 0;
+  }
+}
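
Worked through for the frame counter above: frame_num_within_temporal_struct =
current_video_frame_in_layer % 4 cycles 0, 1, 2, 3, and (n & 1) ? 2 : (n >> 1)
maps those to temporal layers 0, 2, 1, 2 respectively - the 0-2-1-2 pattern named
in the comment.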
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 2, which uses the 0-1-0-1 temporal
+// layering scheme.
+static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
+  int spatial_id, temporal_id;
+  spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+  temporal_id = cpi->svc.temporal_layer_id =
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+      cpi->svc.number_temporal_layers].current_video_frame_in_layer & 1;
+  cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+                                cpi->ext_refresh_alt_ref_frame = 0;
+  if (!temporal_id) {
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_last_frame = 1;
+    if (!spatial_id) {
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+      // base layer is a key frame.
+      cpi->ref_frame_flags = VP9_GOLD_FLAG;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  } else if (temporal_id == 1) {
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_alt_ref_frame = 1;
+    if (!spatial_id) {
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  }
+
+  if (temporal_id == 0) {
+    cpi->lst_fb_idx = spatial_id;
+    if (spatial_id)
+      cpi->gld_fb_idx = spatial_id - 1;
+    else
+      cpi->gld_fb_idx = 0;
+    cpi->alt_fb_idx = 0;
+  } else if (temporal_id == 1) {
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  }
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 0, which has no temporal layering.
+static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
+    VP9_COMP *const cpi) {
+  int spatial_id;
+  spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+  cpi->ext_refresh_last_frame =
+      cpi->ext_refresh_golden_frame = cpi->ext_refresh_alt_ref_frame = 0;
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->ext_refresh_last_frame = 1;
+  if (!spatial_id) {
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+  } else if (cpi->svc.layer_context[0].is_key_frame) {
+    cpi->ref_frame_flags = VP9_GOLD_FLAG;
+  } else {
+    cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+  }
+  cpi->lst_fb_idx = spatial_id;
+  if (spatial_id)
+    cpi->gld_fb_idx = spatial_id - 1;
+  else
+    cpi->gld_fb_idx = 0;
+}
+
+int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
+  int width = 0, height = 0;
+  LAYER_CONTEXT *lc = NULL;
+
+  if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+    set_flags_and_fb_idx_for_temporal_mode3(cpi);
+  } else if (cpi->svc.temporal_layering_mode ==
+           VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+    set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+  } else if (cpi->svc.temporal_layering_mode ==
+           VP9E_TEMPORAL_LAYERING_MODE_0101) {
+    set_flags_and_fb_idx_for_temporal_mode2(cpi);
+  } else if (cpi->svc.temporal_layering_mode ==
+      VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    // VP9E_TEMPORAL_LAYERING_MODE_BYPASS:
+    // If the code reaches this point, the encoder is relying on externally
+    // supplied flags for layering.
+    // However, when spatial+temporal layering is used the buffer indices
+    // cannot be derived automatically, so the bypass mode only works when the
+    // number of spatial layers equals 1.
+    assert(cpi->svc.number_spatial_layers == 1);
+  }
+
+  lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                               cpi->svc.number_temporal_layers +
+                               cpi->svc.temporal_layer_id];
+
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                       lc->scaling_factor_num, lc->scaling_factor_den,
+                       &width, &height);
+
+  if (vp9_set_size_literal(cpi, width, height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
 
   return 0;
 }
 
-static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
-  int layer_id;
-  vpx_svc_parameters_t *layer_param;
+#if CONFIG_SPATIAL_SVC
+int vp9_svc_start_frame(VP9_COMP *const cpi) {
+  int width = 0, height = 0;
   LAYER_CONTEXT *lc;
+  struct lookahead_entry *buf;
+  int count = 1 << (cpi->svc.number_temporal_layers - 1);
 
-  // Find the next layer to be encoded
-  for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
-    if (buf->svc_params[layer_id].spatial_layer >=0)
-      break;
+  cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+  lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+
+  cpi->svc.temporal_layer_id = 0;
+  while ((lc->current_video_frame_in_layer % count) != 0) {
+    ++cpi->svc.temporal_layer_id;
+    count >>= 1;
   }
 
-  if (layer_id == cpi->svc.number_spatial_layers)
-    return 1;
-
-  layer_param = &buf->svc_params[layer_id];
-  cpi->svc.spatial_layer_id = layer_param->spatial_layer;
-  cpi->svc.temporal_layer_id = layer_param->temporal_layer;
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
-
   cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
 
-  if (cpi->svc.spatial_layer_id < 1)
-      cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ?
-                        lc->gold_ref_idx : cpi->lst_fb_idx;
+  if (cpi->svc.spatial_layer_id == 0)
+    cpi->gld_fb_idx = (lc->gold_ref_idx >= 0) ?
+                      lc->gold_ref_idx : cpi->lst_fb_idx;
   else
     cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
 
@@ -290,7 +549,7 @@
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
     }
   } else {
-    if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) {
+    if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
       cpi->alt_fb_idx = lc->alt_ref_idx;
       if (!lc->has_alt_frame)
         cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
@@ -302,7 +561,7 @@
         LAYER_CONTEXT *lc_lower =
             &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
 
-        if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] &&
+        if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
             lc_lower->alt_ref_source != NULL)
           cpi->alt_fb_idx = lc_lower->alt_ref_idx;
         else if (cpi->svc.spatial_layer_id >= 2)
@@ -313,52 +572,74 @@
     }
   }
 
-  if (vp9_set_size_literal(cpi, layer_param->width, layer_param->height) != 0)
-    return VPX_CODEC_INVALID_PARAM;
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                       lc->scaling_factor_num, lc->scaling_factor_den,
+                       &width, &height);
 
-  cpi->oxcf.worst_allowed_q =
-      vp9_quantizer_to_qindex(layer_param->max_quantizer);
-  cpi->oxcf.best_allowed_q =
-      vp9_quantizer_to_qindex(layer_param->min_quantizer);
+  // Workaround for multiple frame contexts. In some frames we can't use
+  // prev_mi because the previous frame may have changed during decoding. The
+  // idea is to put an empty invisible frame in front of them; then we will
+  // not use prev_mi when encoding these frames.
+
+  buf = vp9_lookahead_peek(cpi->lookahead, 0);
+  if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
+      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
+      lc->rc.frames_to_key != 0 &&
+      !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
+    if ((cpi->svc.number_temporal_layers > 1 &&
+         cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
+        (cpi->svc.number_spatial_layers > 1 &&
+         cpi->svc.spatial_layer_id == 0)) {
+      struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0);
+
+      if (buf != NULL) {
+        cpi->svc.empty_frame.ts_start = buf->ts_start;
+        cpi->svc.empty_frame.ts_end = buf->ts_end;
+        cpi->svc.encode_empty_frame_state = ENCODING;
+        cpi->common.show_frame = 0;
+        cpi->ref_frame_flags = 0;
+        cpi->common.frame_type = INTER_FRAME;
+        cpi->lst_fb_idx =
+            cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
+
+        if (cpi->svc.encode_intra_empty_frame != 0)
+          cpi->common.intra_only = 1;
+
+        width = SMALL_FRAME_WIDTH;
+        height = SMALL_FRAME_HEIGHT;
+      }
+    }
+  }
+
+  cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q);
+  cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
 
   vp9_change_config(cpi, &cpi->oxcf);
 
+  if (vp9_set_size_literal(cpi, width, height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
   vp9_set_high_precision_mv(cpi, 1);
 
-  cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source;
+  cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
 
   return 0;
 }
 
-struct lookahead_entry *vp9_svc_lookahead_peek(VP9_COMP *const cpi,
-                                               struct lookahead_ctx *ctx,
-                                               int index, int copy_params) {
-  struct lookahead_entry *buf = vp9_lookahead_peek(ctx, index);
-
-  if (buf != NULL && copy_params != 0) {
-    if (copy_svc_params(cpi, buf) != 0)
-      return NULL;
-  }
-  return buf;
-}
+#endif
 
 struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
                                               struct lookahead_ctx *ctx,
                                               int drain) {
   struct lookahead_entry *buf = NULL;
-
   if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
-    buf = vp9_svc_lookahead_peek(cpi, ctx, 0, 1);
+    buf = vp9_lookahead_peek(ctx, 0);
     if (buf != NULL) {
-      // Only remove the buffer when pop the highest layer. Simply set the
-      // spatial_layer to -1 for lower layers.
-      buf->svc_params[cpi->svc.spatial_layer_id].spatial_layer = -1;
+      // Only remove the buffer when popping the highest layer.
       if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
         vp9_lookahead_pop(ctx, drain);
       }
     }
   }
-
   return buf;
 }
-#endif
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h
index 801449b..b6a5ea5 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -22,18 +22,25 @@
 typedef struct {
   RATE_CONTROL rc;
   int target_bandwidth;
+  int spatial_layer_target_bandwidth;  // Target for the spatial layer.
   double framerate;
   int avg_frame_size;
+  int max_q;
+  int min_q;
+  int scaling_factor_num;
+  int scaling_factor_den;
   TWO_PASS twopass;
-  struct vpx_fixed_buf rc_twopass_stats_in;
+  vpx_fixed_buf_t rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
   int is_key_frame;
-  vpx_svc_parameters_t svc_params_received;
+  int frames_from_key_frame;
+  FRAME_TYPE last_frame_type;
   struct lookahead_entry  *alt_ref_source;
   int alt_ref_idx;
   int gold_ref_idx;
   int has_alt_frame;
   size_t layer_size;
+  struct vpx_psnr_pkt psnr_pkt;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -42,14 +49,27 @@
   int number_spatial_layers;
   int number_temporal_layers;
 
+  int spatial_layer_to_encode;
+
+  // Workaround for multiple frame contexts
+  enum {
+    ENCODED = 0,
+    ENCODING,
+    NEED_TO_ENCODE
+  }encode_empty_frame_state;
+  struct lookahead_entry empty_frame;
+  int encode_intra_empty_frame;
+
   // Store scaled source frames to be used for temporal filter to generate
   // a alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
 
   // Layer context used for rate control in one pass temporal CBR mode or
-  // two pass spatial mode. Defined for temporal or spatial layers for now.
-  // Does not support temporal combined with spatial RC.
-  LAYER_CONTEXT layer_context[MAX(VPX_TS_MAX_LAYERS, VPX_SS_MAX_LAYERS)];
+  // two pass spatial mode.
+  LAYER_CONTEXT layer_context[VPX_MAX_LAYERS];
+  // Indicates what sort of temporal layering is used.
+  // Currently, this only works for CBR mode.
+  VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
 } SVC;
 
 struct VP9_COMP;
@@ -80,27 +100,20 @@
 void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
 
 // Increment number of video frames in layer
-void vp9_inc_frame_in_layer(SVC *svc);
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
 
 // Check if current layer is key frame in spatial upper layer
 int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
 
-// Copy the source image, flags and svc parameters into a new framebuffer
-// with the expected stride/border
-int vp9_svc_lookahead_push(const struct VP9_COMP *const cpi,
-                           struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                           int64_t ts_start, int64_t ts_end,
-                           unsigned int flags);
-
 // Get the next source buffer to encode
 struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
                                               struct lookahead_ctx *ctx,
                                               int drain);
 
-// Get a future source buffer to encode
-struct lookahead_entry *vp9_svc_lookahead_peek(struct VP9_COMP *const cpi,
-                                               struct lookahead_ctx *ctx,
-                                               int index, int copy_params);
+// Start a frame and initialize svc parameters
+int vp9_svc_start_frame(struct VP9_COMP *const cpi);
+
+int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index ce3b311..439eac6 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -15,7 +15,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -23,7 +22,9 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
 
@@ -44,7 +45,7 @@
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
   const InterpKernel *const kernel =
-    vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
+    vp9_filter_kernels[xd->mi[0]->mbmi.interp_filter];
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
@@ -56,6 +57,34 @@
     mv_precision_uv = MV_PRECISION_Q3;
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(y_mb_ptr, stride,
+                                     &pred[0], 16,
+                                     &mv,
+                                     scale,
+                                     16, 16,
+                                     which_mv,
+                                     kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride,
+                                     &pred[256], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride,
+                                     &pred[512], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
@@ -81,7 +110,7 @@
                             kernel, mv_precision_uv, x, y);
 }
 
-void vp9_temporal_filter_init() {
+void vp9_temporal_filter_init(void) {
   int i;
 
   fixed_divide[0] = 0;
@@ -133,11 +162,59 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+                                        unsigned int stride,
+                                        uint8_t *frame2_8,
+                                        unsigned int block_width,
+                                        unsigned int block_height,
+                                        int strength,
+                                        int filter_weight,
+                                        unsigned int *accumulator,
+                                        uint16_t *count) {
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+      modifier *= modifier;
+      modifier *= 3;
+      modifier += rounding;
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
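
Worked through for one pixel under the high-bit-depth filter above, taking
strength = 6 (so rounding = 32): a source/prediction difference of 8 gives
3*8*8 + 32 = 224, 224 >> 6 = 3, hence a weight of 16 - 3 = 13 (further scaled by
filter_weight); a difference of 20 gives 3*20*20 + 32 = 1232, 1232 >> 6 = 19,
which is clamped to 16, so that pixel contributes weight 0.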
+
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
                                               int stride) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
   int step_param;
@@ -145,6 +222,7 @@
   int bestsme = INT_MAX;
   int distortion;
   unsigned int sse;
+  int cost_list[5];
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -168,6 +246,7 @@
 
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 cond_cost_list(cpi, cost_list),
                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -177,6 +256,7 @@
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
                                          NULL, NULL,
                                          &distortion, &sse, NULL, 0, 0);
 
@@ -188,6 +268,7 @@
 }
 
 static void temporal_filter_iterate_c(VP9_COMP *cpi,
+                                      YV12_BUFFER_CONFIG **frames,
                                       int frame_count,
                                       int alt_ref_index,
                                       int strength,
@@ -196,22 +277,35 @@
   int frame;
   int mb_col, mb_row;
   unsigned int filter_weight;
-  int mb_cols = cpi->common.mb_cols;
-  int mb_rows = cpi->common.mb_rows;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
   int mb_y_offset = 0;
   int mb_uv_offset = 0;
-  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
-  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
-  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+  DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
-  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 * 3);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t,  predictor16[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint8_t,  predictor8[16 * 16 * 3]);
+  uint8_t *predictor;
+#else
+  DECLARE_ALIGNED(16, uint8_t,  predictor[16 * 16 * 3]);
+#endif
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
   const int mb_uv_width  = 16 >> mbd->plane[1].subsampling_x;
 
   // Save input state
   uint8_t* input_buffer[MAX_MB_PLANE];
   int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  } else {
+    predictor = predictor8;
+  }
+#endif
 
   for (i = 0; i < MAX_MB_PLANE; i++)
     input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -228,26 +322,26 @@
     //  8 - VP9_INTERP_EXTEND.
     // To keep the mv in play for both Y and UV planes the max that it
     //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
-    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+    cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+    cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
                          + (17 - 2 * VP9_INTERP_EXTEND);
 
     for (mb_col = 0; mb_col < mb_cols; mb_col++) {
       int i, j, k;
       int stride;
 
-      vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
-      vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+      memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+      memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
 
-      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+      cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+      cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
                            + (17 - 2 * VP9_INTERP_EXTEND);
 
       for (frame = 0; frame < frame_count; frame++) {
         const int thresh_low  = 10000;
         const int thresh_high = 20000;
 
-        if (cpi->frames[frame] == NULL)
+        if (frames[frame] == NULL)
           continue;
 
         mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
@@ -258,9 +352,9 @@
         } else {
           // Find best match in this frame by MC
           int err = temporal_filter_find_matching_mb_c(cpi,
-              cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->y_stride);
+              frames[alt_ref_index]->y_buffer + mb_y_offset,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->y_stride);
 
           // Assign higher weight to matching MB if its error
           // score is lower. If not applying MC default behavior
@@ -272,16 +366,54 @@
         if (filter_weight != 0) {
           // Construct the predictors
           temporal_filter_predictors_mb_c(mbd,
-              cpi->frames[frame]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->u_buffer + mb_uv_offset,
-              cpi->frames[frame]->v_buffer + mb_uv_offset,
-              cpi->frames[frame]->y_stride,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->u_buffer + mb_uv_offset,
+              frames[frame]->v_buffer + mb_uv_offset,
+              frames[frame]->y_stride,
               mb_uv_width, mb_uv_height,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
               predictor, scale,
               mb_col * 16, mb_row * 16);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            int adj_strength = strength + 2 * (mbd->bd - 8);
+            // Apply the filter (YUV)
+            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
+                                             f->y_stride,
+                                             predictor, 16, 16, adj_strength,
+                                             filter_weight,
+                                             accumulator, count);
+            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 256,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength,
+                                             filter_weight, accumulator + 256,
+                                             count + 256);
+            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 512,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength, filter_weight,
+                                             accumulator + 512, count + 512);
+          } else {
+            // Apply the filter (YUV)
+            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
+            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
+            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
+          }
+#else
           // Apply the filter (YUV)
           vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
@@ -297,9 +429,108 @@
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,
                                     count + 512);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *dst1_16;
+        uint16_t *dst2_16;
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1_16[byte] = (uint16_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - mb_uv_width;
+        }
+      } else {
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1[byte] = (uint8_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - mb_uv_width;
+        }
+      }
+#else
       // Normalize filter output to produce AltRef frame
       dst1 = cpi->alt_ref_buffer.y_buffer;
       stride = cpi->alt_ref_buffer.y_stride;
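
The normalization loops above avoid a per-pixel division: fixed_divide[] is assumed to hold
(1 << 19) / n (vp9_temporal_filter_init() fills it that way), so multiplying by the table
entry and shifting right by 19 approximates a rounded division of the accumulator by the
sample count. A small self-contained check of the idiom, with fixed_div() standing in for
the real table:

  #include <stdio.h>

  /* Stand-in for the encoder's fixed_divide[] table: (1 << 19) / n. */
  static unsigned int fixed_div(unsigned int n) { return (1u << 19) / n; }

  int main(void) {
    unsigned int accumulator = 700, count = 3;       /* example filter sums */
    unsigned int pval = accumulator + (count >> 1);  /* + count/2 to round  */
    pval *= fixed_div(count);
    pval >>= 19;                   /* ~ (accumulator + count/2) / count     */
    printf("%u\n", pval);          /* prints 233, i.e. 700/3 rounded        */
    return 0;
  }
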
@@ -343,6 +574,7 @@
         }
         byte += stride - mb_uv_width;
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       mb_y_offset += 16;
       mb_uv_offset += mb_uv_width;
     }
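
In the CONFIG_VP9_HIGHBITDEPTH branch of the loop above, the filter is applied with
adj_strength = strength + 2 * (mbd->bd - 8). Assuming the per-pixel weight is driven by a
squared pixel difference, as in the 8-bit vp9_temporal_filter_apply path, each extra bit of
depth doubles the difference and quadruples its square, so adding two to the shift per extra
bit keeps the weighting comparable across bit depths. A numeric sketch (values illustrative):

  #include <stdio.h>

  int main(void) {
    const int bd = 10, strength = 6;
    const int diff8 = 20;                    /* an 8-bit pixel difference */
    const int diff_hbd = diff8 << (bd - 8);  /* same content at 10 bits   */
    printf("%d %d\n",
           (diff8 * diff8 * 3) >> strength,
           (diff_hbd * diff_hbd * 3) >> (strength + 2 * (bd - 8)));
    /* Both print 18: the extra 2 * (bd - 8) cancels the squared up-shift. */
    return 0;
  }
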
@@ -357,12 +589,14 @@
 
 // Apply buffer limits and context specific adjustments to arnr filter.
 static void adjust_arnr_filter(VP9_COMP *cpi,
-                               int distance, int group_boost) {
+                               int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int frames_after_arf =
-            vp9_lookahead_depth(cpi->lookahead) - distance - 1;
+      vp9_lookahead_depth(cpi->lookahead) - distance - 1;
   int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
   int frames_bwd;
-  int q;
+  int q, frames, strength;
 
   // Define the forward and backwards filter limits for this arnr group.
   if (frames_fwd > frames_after_arf)
@@ -375,47 +609,52 @@
   // For even length filter there is one more frame backward
   // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
   if (frames_bwd < distance)
-    frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+    frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
 
   // Set the baseline active filter size.
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+  frames = frames_bwd + 1 + frames_fwd;
 
   // Adjust the strength based on active max q.
   if (cpi->common.current_video_frame > 1)
     q = ((int)vp9_convert_qindex_to_q(
-        cpi->rc.avg_frame_qindex[INTER_FRAME]));
+        cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth));
   else
     q = ((int)vp9_convert_qindex_to_q(
-        cpi->rc.avg_frame_qindex[KEY_FRAME]));
+        cpi->rc.avg_frame_qindex[KEY_FRAME], cpi->common.bit_depth));
   if (q > 16) {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+    strength = oxcf->arnr_strength;
   } else {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2);
-    if (cpi->active_arnr_strength < 0)
-      cpi->active_arnr_strength = 0;
+    strength = oxcf->arnr_strength - ((16 - q) / 2);
+    if (strength < 0)
+      strength = 0;
   }
 
   // Adjust number of frames in filter and strength based on gf boost level.
-  if (cpi->active_arnr_frames > (group_boost / 150)) {
-    cpi->active_arnr_frames = (group_boost / 150);
-    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  if (frames > group_boost / 150) {
+    frames = group_boost / 150;
+    frames += !(frames & 1);
   }
-  if (cpi->active_arnr_strength > (group_boost / 300)) {
-    cpi->active_arnr_strength = (group_boost / 300);
+
+  if (strength > group_boost / 300) {
+    strength = group_boost / 300;
   }
 
   // Adjustments for second level arf in multi arf case.
   if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
-      cpi->active_arnr_strength >>= 1;
+      strength >>= 1;
     }
   }
+
+  *arnr_frames = frames;
+  *arnr_strength = strength;
 }
 
 void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int frame;
   int frames_to_blur;
   int start_frame;
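
adjust_arnr_filter() now reports the active frame count and strength through out-parameters
instead of writing cpi->active_arnr_frames/active_arnr_strength, and vp9_temporal_filter()
keeps them in locals along with a local frames[] array. The clamping against the group boost
works as before; a worked example with illustrative numbers:

  #include <stdio.h>

  int main(void) {
    int frames = 7, strength = 5;        /* illustrative baseline values */
    const int group_boost = 600;

    if (frames > group_boost / 150) {    /* 600 / 150 = 4 */
      frames = group_boost / 150;
      frames += !(frames & 1);           /* pad even lengths: 4 -> 5 */
    }
    if (strength > group_boost / 300)    /* 600 / 300 = 2 */
      strength = group_boost / 300;

    printf("frames=%d strength=%d\n", frames, strength);  /* frames=5 strength=2 */
    return 0;
  }
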
@@ -423,59 +662,88 @@
   int frames_to_blur_backward;
   int frames_to_blur_forward;
   struct scale_factors sf;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
 
   // Apply context specific adjustments to the arnr filter parameters.
-  adjust_arnr_filter(cpi, distance, rc->gfu_boost);
-  strength = cpi->active_arnr_strength;
-  frames_to_blur = cpi->active_arnr_frames;
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
   frames_to_blur_backward = (frames_to_blur / 2);
   frames_to_blur_forward = ((frames_to_blur - 1) / 2);
   start_frame = distance + frames_to_blur_forward;
 
   // Setup frame pointers, NULL indicates frame not included in filter.
-  vp9_zero(cpi->frames);
   for (frame = 0; frame < frames_to_blur; ++frame) {
     const int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
-    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+    frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
-  // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  if (is_spatial_svc(cpi)) {
-    // In spatial svc the scaling factors might be less then 1/2. So we will use
-    // non-normative scaling.
-    int frame_used = 0;
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height);
+  if (frames_to_blur > 0) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    if (cpi->use_svc) {
+      // In spatial svc the scaling factors might be less than 1/2.
+      // So we will use non-normative scaling.
+      int frame_used = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          cm->use_highbitdepth);
+#else
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    for (frame = 0; frame < frames_to_blur; ++frame) {
-      if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width ||
-          cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) {
-        if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
-                                     cm->width, cm->height,
-                                     cm->subsampling_x, cm->subsampling_y,
-                                     VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
-                                     NULL))
-          vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to reallocate alt_ref_buffer");
-
-        cpi->frames[frame] =
-            vp9_scale_if_required(cm, cpi->frames[frame],
-                                  &cpi->svc.scaled_frames[frame_used]);
-        ++frame_used;
+      for (frame = 0; frame < frames_to_blur; ++frame) {
+        if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+            cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
+          if (vpx_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
+                                       cm->width, cm->height,
+                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                       cm->use_highbitdepth,
+#endif
+                                       VP9_ENC_BORDER_IN_PIXELS,
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL)) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to reallocate alt_ref_buffer");
+          }
+          frames[frame] = vp9_scale_if_required(
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used]);
+          ++frame_used;
+        }
       }
+      cm->mi = cm->mip + cm->mi_stride + 1;
+      xd->mi = cm->mi_grid_visible;
+      xd->mi[0] = cm->mi;
+    } else {
+      // ARF is produced at the native frame size and resized when coded.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        cm->use_highbitdepth);
+#else
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-  } else {
-    vp9_setup_scale_factors_for_frame(&sf,
-                                      get_frame_new_buffer(cm)->y_crop_width,
-                                      get_frame_new_buffer(cm)->y_crop_height,
-                                      cm->width, cm->height);
   }
 
-  temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
-                            strength, &sf);
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);
 }
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.h b/libvpx/vp9/encoder/vp9_temporal_filter.h
index a971e0a..f537b88 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.h
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.h
@@ -15,7 +15,7 @@
 extern "C" {
 #endif
 
-void vp9_temporal_filter_init();
+void vp9_temporal_filter_init(void);
 void vp9_temporal_filter(VP9_COMP *cpi, int distance);
 
 #ifdef __cplusplus
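
The header change above is more than cosmetic: in C, an empty parameter list leaves the
argument types unspecified, while (void) declares a true zero-argument prototype that lets
the compiler reject accidental arguments. A small fragment showing the difference:

  void vp9_temporal_filter_init();      /* old: argument types unspecified */
  void vp9_temporal_filter_init(void);  /* new: genuinely takes no arguments */

  /* With the first declaration alone, a stray call such as
   *   vp9_temporal_filter_init(123);
   * may pass compilation silently; with the (void) prototype the compiler
   * must reject it. */
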
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 6068b85..85cb2fc 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -17,19 +17,42 @@
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr;
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+  {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+  {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+  {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+  {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+  {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+  {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+  {8, 7}, {8, 5}, {8, 3}, {8, 1},
+  {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+  {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+  {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+  {1, 0},  {2, 0}, {3, 0}, {4, 0},
+  {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+  {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+  {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+  {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+  {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+  {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+  {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+  {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+  {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+  {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+    (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+    / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -EOB_TOKEN, 2,                       // 0  = EOB
   -ZERO_TOKEN, 4,                      // 1  = ZERO
   -ONE_TOKEN, 6,                       // 2  = ONE
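
vp9_dct_cat_lt_10_value_tokens above points at the middle entry of the static table, so it
can be indexed directly with a signed coefficient value (negative index for negative
coefficients) without an explicit offset. The idiom in isolation, with an illustrative table
rather than the actual libvpx data:

  #include <stdio.h>

  static const int table[] = { -3, -2, -1, 0, 1, 2, 3 };
  static const int *centered = table + 3;   /* points at the middle entry */

  int main(void) {
    printf("%d %d %d\n", centered[-2], centered[0], centered[2]);  /* -2 0 2 */
    return 0;
  }
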
@@ -43,123 +66,390 @@
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
 };
 
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-  2, 6,                                // 0 = LOW_VAL
-  -TWO_TOKEN, 4,                       // 1 = TWO
-  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
-  8, 10,                               // 3 = HIGH_LOW
-  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
-  12, 14,                              // 5 = CAT_THREEFOUR
-  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
-  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
+static const vpx_tree_index cat1[2] = {0, 0};
+static const vpx_tree_index cat2[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+    14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
+
+static const int16_t zero_cost[] = {0};
+static const int16_t one_cost[] = {255, 257};
+static const int16_t two_cost[] = {255, 257};
+static const int16_t three_cost[] = {255, 257};
+static const int16_t four_cost[] = {255, 257};
+static const int16_t cat1_cost[] = {429, 431, 616, 618};
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {
+  820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+  1289, 1291
+};
+static const int16_t cat4_cost[] = {
+  1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+  1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+  1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {
+  1269, 1271, 1283, 1285, 1306, 1308, 1320,
+  1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+  1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+  1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+  1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+  1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp9_cat6_low_cost[256] = {
+  1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+  1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+  1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+  1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+  1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+  1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+  1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+  2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+  2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+  2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+  2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+  2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+  2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+  2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+  2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+  2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+  2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+  2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+  2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+  2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+  2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+  2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp9_cat6_high_cost[128] = {
+  72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+  1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+  5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+  7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+  7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
 };
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+#if CONFIG_VP9_HIGHBITDEPTH
+const int16_t vp9_cat6_high10_high_cost[512] = {
+  74, 894, 1185, 2005, 1450, 2270, 2561,
+  3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+  7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+  2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+  5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+  6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+  8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+  9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+  9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+  10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+  12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+  8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+  9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+  11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+  13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp9_cat6_high12_high_cost[2048] = {
+  76, 896, 1187, 2007, 1452, 2272, 2563,
+  3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+  2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+  5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+  6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+  3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+  6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+  6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+  5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+  7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+  5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+  8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+  9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+  12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+  12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+  5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+  5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+  13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+  13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+  13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+  16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+  14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+  16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
+#endif
 
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 14);
-}
+#if CONFIG_VP9_HIGHBITDEPTH
+static const vpx_tree_index cat1_high10[2] = {0, 0};
+static const vpx_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 0, 0};
+static const vpx_tree_index cat1_high12[2] = {0, 0};
+static const vpx_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 32, 32, 34, 34, 0, 0};
+#endif
 
 const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},                              // ZERO_TOKEN
-  {0, 0, 0, 1},                              // ONE_TOKEN
-  {0, 0, 0, 2},                              // TWO_TOKEN
-  {0, 0, 0, 3},                              // THREE_TOKEN
-  {0, 0, 0, 4},                              // FOUR_TOKEN
-  {cat1, vp9_cat1_prob, 1,  CAT1_MIN_VAL},   // CATEGORY1_TOKEN
-  {cat2, vp9_cat2_prob, 2,  CAT2_MIN_VAL},   // CATEGORY2_TOKEN
-  {cat3, vp9_cat3_prob, 3,  CAT3_MIN_VAL},   // CATEGORY3_TOKEN
-  {cat4, vp9_cat4_prob, 4,  CAT4_MIN_VAL},   // CATEGORY4_TOKEN
-  {cat5, vp9_cat5_prob, 5,  CAT5_MIN_VAL},   // CATEGORY5_TOKEN
-  {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL},   // CATEGORY6_TOKEN
-  {0, 0, 0, 0}                               // EOB_TOKEN
+  {0, 0, 0, 0, zero_cost},                             // ZERO_TOKEN
+  {0, 0, 0, 1, one_cost},                              // ONE_TOKEN
+  {0, 0, 0, 2, two_cost},                              // TWO_TOKEN
+  {0, 0, 0, 3, three_cost},                            // THREE_TOKEN
+  {0, 0, 0, 4, four_cost},                             // FOUR_TOKEN
+  {cat1, vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
+  {cat2, vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
+  {cat3, vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
+  {cat4, vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
+  {cat5, vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
+  {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
+  {0, 0, 0, 0, zero_cost}                              // EOB_TOKEN
 };
 
-struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0, zero_cost},                                           // ZERO
+  {0, 0, 0, 1, one_cost},                                            // ONE
+  {0, 0, 0, 2, two_cost},                                            // TWO
+  {0, 0, 0, 3, three_cost},                                          // THREE
+  {0, 0, 0, 4, four_cost},                                           // FOUR
+  {cat1_high10, vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high10, vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high10, vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high10, vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high10, vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                            // EOB
+};
+const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0, zero_cost},                                           // ZERO
+  {0, 0, 0, 1, one_cost},                                            // ONE
+  {0, 0, 0, 2, two_cost},                                            // TWO
+  {0, 0, 0, 3, three_cost},                                          // THREE
+  {0, 0, 0, 4, four_cost},                                           // FOUR
+  {cat1_high12, vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high12, vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high12, vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high12, vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high12, vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                            // EOB
+};
+#endif
 
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
+const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
+  {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
+  {125, 7}, {126, 7}, {127, 7}, {0, 1}
+};
 
-void vp9_tokenize_initialize() {
-  TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
-  const vp9_extra_bit *const e = vp9_extra_bits;
-
-  int i = -DCT_MAX_VALUE;
-  int sign = 1;
-
-  do {
-    if (!i)
-      sign = 0;
-
-    {
-      const int a = sign ? -i : i;
-      int eb = sign;
-
-      if (a > 4) {
-        int j = 4;
-
-        while (++j < 11  &&  e[j].base_val <= a) {}
-
-        t[i].token = --j;
-        eb |= (a - e[j].base_val) << 1;
-      } else {
-        t[i].token = a;
-      }
-      t[i].extra = eb;
-    }
-
-    // initialize the cost for extra bits for all possible coefficient value.
-    {
-      int cost = 0;
-      const vp9_extra_bit *p = &vp9_extra_bits[t[i].token];
-
-      if (p->base_val) {
-        const int extra = t[i].extra;
-        const int length = p->len;
-
-        if (length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, length);
-
-        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
-        dct_value_cost[i + DCT_MAX_VALUE] = cost;
-      }
-    }
-  } while (++i < DCT_MAX_VALUE);
-
-  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-}
 
 struct tokenize_b_args {
   VP9_COMP *cpi;
-  MACROBLOCKD *xd;
+  ThreadData *td;
   TOKENEXTRA **tp;
 };
 
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
-  MACROBLOCKD *const xd = args->xd;
-  struct macroblock_plane *p = &args->cpi->mb.plane[plane];
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
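
The catN tree-index arrays in the hunk above are now compile-time constants; they hold
exactly what the removed init_bit_tree()/init_bit_trees() pair used to compute at start-up.
A quick standalone check against, for example, the cat3 table (int stands in for
vpx_tree_index here):

  #include <stdio.h>

  /* Same construction the removed init_bit_tree() performed at run time. */
  static void init_bit_tree(int *p, int n) {
    int i = 0;
    while (++i < n) {
      p[0] = p[1] = i << 1;
      p += 2;
    }
    p[0] = p[1] = 0;
  }

  int main(void) {
    int cat3[6], i;
    init_bit_tree(cat3, 3);
    for (i = 0; i < 6; i++)
      printf("%d ", cat3[i]);    /* prints: 2 2 4 4 0 0 */
    printf("\n");
    return 0;
  }
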
@@ -167,8 +457,8 @@
                    aoff, loff);
 }
 
-static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
-                             int16_t extra, uint8_t token,
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
+                             int32_t extra, uint8_t token,
                              uint8_t skip_eob_node,
                              unsigned int *counts) {
   (*t)->token = token;
@@ -180,7 +470,7 @@
 }
 
 static INLINE void add_token_no_extra(TOKENEXTRA **t,
-                                      const vp9_prob *context_tree,
+                                      const vpx_prob *context_tree,
                                       uint8_t token,
                                       uint8_t skip_eob_node,
                                       unsigned int *counts) {
@@ -194,17 +484,19 @@
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
   const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   VP9_COMP *cpi = args->cpi;
-  MACROBLOCKD *xd = args->xd;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   TOKENEXTRA **tp = args->tp;
   uint8_t token_cache[32 * 32];
-  struct macroblock_plane *p = &cpi->mb.plane[plane];
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int pt; /* near block/prev token context index */
@@ -212,20 +504,21 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   const scan_order *so;
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      cpi->coef_counts[tx_size][type][ref];
-  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      cpi->common.fc.coef_probs[tx_size][type][ref];
+      td->rd_counts.coef_counts[tx_size][type][ref];
+  vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc->coef_probs[tx_size][type][ref];
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
-      cpi->common.counts.eob_branch[tx_size][type][ref];
+      td->counts->eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-
+  int16_t token;
+  EXTRABIT extra;
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
@@ -235,6 +528,7 @@
   scan = so->scan;
   nb = so->neighbors;
   c = 0;
+
   while (c < eob) {
     int v = 0;
     int skip_eob = 0;
@@ -252,15 +546,13 @@
       v = qcoeff[scan[c]];
     }
 
-    add_token(&t, coef_probs[band[c]][pt],
-              vp9_dct_value_tokens_ptr[v].extra,
-              (uint8_t)vp9_dct_value_tokens_ptr[v].token,
-              (uint8_t)skip_eob,
-              counts[band[c]][pt]);
+    vp9_get_token_extra(v, &token, &extra);
+
+    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+              (uint8_t)skip_eob, counts[band[c]][pt]);
     eob_branch[band[c]][pt] += !skip_eob;
 
-    token_cache[scan[c]] =
-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token];
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     pt = get_coef_context(nb, token_cache, c);
   }
@@ -276,52 +568,69 @@
 }
 
 struct is_skippable_args {
-  MACROBLOCK *x;
+  uint16_t *eobs;
   int *skippable;
 };
 static void is_skippable(int plane, int block,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          void *argv) {
   struct is_skippable_args *args = argv;
+  (void)plane;
   (void)plane_bsize;
   (void)tx_size;
-  args->skippable[0] &= (!args->x->plane[plane].eobs[block]);
+  args->skippable[0] &= (!args->eobs[block]);
 }
 
 // TODO(yaowu): rewrite and optimize this function to remove the usage of
 //              vp9_foreach_transform_block() and simplify is_skippable().
 int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   int result = 1;
-  struct is_skippable_args args = {x, &result};
+  struct is_skippable_args args = {x->plane[plane].eobs, &result};
   vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
                                          &args);
   return result;
 }
 
-void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize) {
+static void has_high_freq_coeff(int plane, int block,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                void *argv) {
+  struct is_skippable_args *args = argv;
+  int eobs = (tx_size == TX_4X4) ? 3 : 10;
+  (void) plane;
+  (void) plane_bsize;
+
+  *(args->skippable) |= (args->eobs[block] > eobs);
+}
+
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int result = 0;
+  struct is_skippable_args args = {x->plane[plane].eobs, &result};
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
+                                         has_high_freq_coeff, &args);
+  return result;
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     int dry_run, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  TOKENEXTRA *t_backup = *t;
   const int ctx = vp9_get_skip_context(xd);
-  const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                              SEG_LVL_SKIP);
-  struct tokenize_b_args arg = {cpi, xd, t};
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
   if (mbmi->skip) {
     if (!dry_run)
-      cm->counts.skip[ctx][1] += skip_inc;
+      td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
-    if (dry_run)
-      *t = t_backup;
     return;
   }
 
   if (!dry_run) {
-    cm->counts.skip[ctx][0] += skip_inc;
+    td->counts->skip[ctx][0] += skip_inc;
     vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
   } else {
     vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
-    *t = t_backup;
   }
 }
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index 063c0ba..11b78ba 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -20,32 +20,39 @@
 extern "C" {
 #endif
 
-void vp9_tokenize_initialize();
-
 #define EOSB_TOKEN 127     // Not signalled, encoder only
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  typedef int32_t EXTRABIT;
+#else
+  typedef int16_t EXTRABIT;
+#endif
+
+
 typedef struct {
   int16_t token;
-  int16_t extra;
+  EXTRABIT extra;
 } TOKENVALUE;
 
 typedef struct {
-  const vp9_prob *context_tree;
-  int16_t         extra;
-  uint8_t         token;
-  uint8_t         skip_eob_node;
+  const vpx_prob *context_tree;
+  EXTRABIT extra;
+  uint8_t token;
+  uint8_t skip_eob_node;
 } TOKENEXTRA;
 
-extern const vp9_tree_index vp9_coef_tree[];
-extern const vp9_tree_index vp9_coef_con_tree[];
-extern struct vp9_token vp9_coef_encodings[];
+extern const vpx_tree_index vp9_coef_tree[];
+extern const vpx_tree_index vp9_coef_con_tree[];
+extern const struct vp9_token vp9_coef_encodings[];
 
 int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 struct VP9_COMP;
+struct ThreadData;
 
-void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
 
 extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
@@ -53,6 +60,50 @@
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int16_t vp9_cat6_high_cost[128];
+extern const int16_t vp9_cat6_high10_high_cost[512];
+extern const int16_t vp9_cat6_high12_high_cost[2048];
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
+                                   const int16_t *cat6_high_table) {
+  if (token != CATEGORY6_TOKEN)
+    return vp9_extra_bits[token].cost[extrabits];
+  return vp9_cat6_low_cost[extrabits & 0xff]
+      + cat6_high_table[extrabits >> 8];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+  return bit_depth == 8 ? vp9_cat6_high_cost
+      : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
+         vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+  (void) bit_depth;
+  return vp9_cat6_high_cost;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+    *token = CATEGORY6_TOKEN;
+    if (v >= CAT6_MIN_VAL)
+      *extra = 2 * v - 2 * CAT6_MIN_VAL;
+    else
+      *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+    return;
+  }
+  *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+  *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+    return 10;
+  return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
+
 
 #ifdef __cplusplus
 }  // extern "C"
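
Note: the new inline helpers above split a large coefficient into a
category-6 token plus packed extra bits, and split the category-6 cost lookup
into a low-byte table and a bit-depth dependent high table. A standalone
sketch of the packing rule, assuming CAT6_MIN_VAL is the smallest magnitude
coded with the category-6 token (the real constant is defined elsewhere in
the encoder; the value 67 below is for illustration only):

#define SKETCH_CAT6_MIN_VAL 67  /* illustrative stand-in for CAT6_MIN_VAL */

static void sketch_get_token_extra(int v, int16_t *token, int32_t *extra) {
  /* Magnitudes below CAT6_MIN_VAL are looked up directly in
   * vp9_dct_cat_lt_10_value_tokens (not reproduced here). */
  if (v < SKETCH_CAT6_MIN_VAL && v > -SKETCH_CAT6_MIN_VAL)
    return;
  *token = 10;  /* CATEGORY6_TOKEN, as in vp9_get_token() above */
  /* The sign lands in the low bit and the magnitude above the minimum in the
   * remaining bits, so vp9_get_cost() can split the lookup into
   * (extra & 0xff) for the low table and (extra >> 8) for the high table. */
  *extra = (v >= SKETCH_CAT6_MIN_VAL) ? 2 * (v - SKETCH_CAT6_MIN_VAL)
                                      : 2 * (-v - SKETCH_CAT6_MIN_VAL) + 1;
}
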
diff --git a/libvpx/vp9/encoder/vp9_treewriter.c b/libvpx/vp9/encoder/vp9_treewriter.c
index bb04b40..0fc078e 100644
--- a/libvpx/vp9/encoder/vp9_treewriter.c
+++ b/libvpx/vp9/encoder/vp9_treewriter.c
@@ -10,13 +10,13 @@
 
 #include "vp9/encoder/vp9_treewriter.h"
 
-static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree,
+static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree,
                      int i, int v, int l) {
   v += v;
   ++l;
 
   do {
-    const vp9_tree_index j = tree[i++];
+    const vpx_tree_index j = tree[i++];
     if (j <= 0) {
       tokens[-j].value = v;
       tokens[-j].len = l;
@@ -27,11 +27,11 @@
 }
 
 void vp9_tokens_from_tree(struct vp9_token *tokens,
-                          const vp9_tree_index *tree) {
+                          const vpx_tree_index *tree) {
   tree2tok(tokens, tree, 0, 0, 0);
 }
 
-static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
                                          unsigned int branch_ct[][2],
                                          const unsigned int num_events[]) {
   unsigned int left, right;
@@ -51,7 +51,7 @@
   return left + right;
 }
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[/* n-1 */][2],
                                       const unsigned int num_events[/* n */]) {
   convert_distribution(0, tree, branch_ct, num_events);
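
Note: the body of convert_distribution() is elided by the hunk above. Its
effect, under the usual libvpx tree convention (non-positive entries are
negated symbol indices, positive entries point at the next node pair), is to
turn per-symbol counts into per-node left/right branch counts. A small worked
example with a hypothetical three-symbol tree:

/*   tree       = { -0, 2,  -1, -2 }      symbol 0 | { symbol 1, symbol 2 }
 *   num_events = {  5, 3, 2 }
 * vp9_tree_probs_from_distribution(tree, branch_ct, num_events) then gives
 *   branch_ct[0] = { 5, 5 }   root: 5 times the 0-branch, 5 times the 1-branch
 *   branch_ct[1] = { 3, 2 }   second node: symbol 1 vs. symbol 2
 * i.e. each internal node records how often coding descended to its 0-branch
 * versus its 1-branch, which is what the probability adaptation consumes. */
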
diff --git a/libvpx/vp9/encoder/vp9_treewriter.h b/libvpx/vp9/encoder/vp9_treewriter.h
index 4a76d87..0f89350 100644
--- a/libvpx/vp9/encoder/vp9_treewriter.h
+++ b/libvpx/vp9/encoder/vp9_treewriter.h
@@ -11,13 +11,13 @@
 #ifndef VP9_ENCODER_VP9_TREEWRITER_H_
 #define VP9_ENCODER_VP9_TREEWRITER_H_
 
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx_dsp/bitwriter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[ /* n - 1 */ ][2],
                                       const unsigned int num_events[ /* n */ ]);
 
@@ -26,20 +26,20 @@
   int len;
 };
 
-void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *);
+void vp9_tokens_from_tree(struct vp9_token*, const vpx_tree_index *);
 
-static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree,
-                                  const vp9_prob *probs, int bits, int len,
-                                  vp9_tree_index i) {
+static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+                                  const vpx_prob *probs, int bits, int len,
+                                  vpx_tree_index i) {
   do {
     const int bit = (bits >> --len) & 1;
-    vp9_write(w, bit, probs[i >> 1]);
+    vpx_write(w, bit, probs[i >> 1]);
     i = tree[i + bit];
   } while (len);
 }
 
-static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree,
-                                   const vp9_prob *probs,
+static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
+                                   const vpx_prob *probs,
                                    const struct vp9_token *token) {
   vp9_write_tree(w, tree, probs, token->value, token->len, 0);
 }
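
Note: for the same style of tree, vp9_tokens_from_tree() flattens each symbol
into the bit pattern that vp9_write_tree()/vp9_write_token() later replay
through vpx_write(). A worked example with the three-symbol tree used in the
sketch above:

/* tree = { -0, 2,  -1, -2 } yields
 *   tokens[0] = { value 0, len 1 }    written as the single bit 0
 *   tokens[1] = { value 2, len 2 }    written as bits 1, 0
 *   tokens[2] = { value 3, len 2 }    written as bits 1, 1
 * vp9_write_tree() emits the bits most-significant first and codes the bit at
 * tree position i with probs[i >> 1], i.e. probs[0] at the root and probs[1]
 * at the second node of this tree. */
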
diff --git a/libvpx/vp9/encoder/vp9_variance.c b/libvpx/vp9/encoder/vp9_variance.c
deleted file mode 100644
index eb5ae2e..0000000
--- a/libvpx/vp9/encoder/vp9_variance.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-void variance(const uint8_t *a, int  a_stride,
-              const uint8_t *b, int  b_stride,
-              int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// first-pass of 2-D separable filter.
-//
-// Produces int32_t output to retain precision for next pass. Two filter taps
-// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
-// defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
-                                              uint16_t *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const int16_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// second-pass of 2-D separable filter.
-//
-// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
-// stride). It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
-                                               uint8_t *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const int16_t *vp9_filter) {
-  unsigned int  i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-      src_ptr++;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; i++)
-    sum += src_ptr[i] * src_ptr[i];
-
-  return sum;
-}
-
-#define VAR(W, H) \
-unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                       const uint8_t *b, int b_stride, \
-                                       unsigned int *sse) { \
-  int sum; \
-  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
-#define SUBPIX_VAR(W, H) \
-unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint8_t temp2[H * W]; \
-\
-  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    BILINEAR_FILTERS_2TAP(xoffset)); \
-  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
-}
-
-#define SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint8_t temp2[H * W]; \
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \
-\
-  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    BILINEAR_FILTERS_2TAP(xoffset)); \
-  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     BILINEAR_FILTERS_2TAP(yoffset)); \
-\
-  vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
-\
-  return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
-}
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
-                       const uint8_t *ref_ptr, int ref_stride,
-                       unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
-}
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
-                     const uint8_t *ref_ptr, int ref_stride,
-                     unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
-                            const uint8_t *ref, int ref_stride,
-                            unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
-  return *sse;
-}
-
-VAR(4, 4)
-SUBPIX_VAR(4, 4)
-SUBPIX_AVG_VAR(4, 4)
-
-VAR(4, 8)
-SUBPIX_VAR(4, 8)
-SUBPIX_AVG_VAR(4, 8)
-
-VAR(8, 4)
-SUBPIX_VAR(8, 4)
-SUBPIX_AVG_VAR(8, 4)
-
-VAR(8, 8)
-SUBPIX_VAR(8, 8)
-SUBPIX_AVG_VAR(8, 8)
-
-VAR(8, 16)
-SUBPIX_VAR(8, 16)
-SUBPIX_AVG_VAR(8, 16)
-
-VAR(16, 8)
-SUBPIX_VAR(16, 8)
-SUBPIX_AVG_VAR(16, 8)
-
-VAR(16, 16)
-SUBPIX_VAR(16, 16)
-SUBPIX_AVG_VAR(16, 16)
-
-VAR(16, 32)
-SUBPIX_VAR(16, 32)
-SUBPIX_AVG_VAR(16, 32)
-
-VAR(32, 16)
-SUBPIX_VAR(32, 16)
-SUBPIX_AVG_VAR(32, 16)
-
-VAR(32, 32)
-SUBPIX_VAR(32, 32)
-SUBPIX_AVG_VAR(32, 32)
-
-VAR(32, 64)
-SUBPIX_VAR(32, 64)
-SUBPIX_AVG_VAR(32, 64)
-
-VAR(64, 32)
-SUBPIX_VAR(64, 32)
-SUBPIX_AVG_VAR(64, 32)
-
-VAR(64, 64)
-SUBPIX_VAR(64, 64)
-SUBPIX_AVG_VAR(64, 64)
-
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
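
Note: vp9_variance.c is deleted here, presumably in favor of shared vpx_dsp
implementations. For reference, every deleted VAR(W, H) kernel reduces to the
scalar computation below, which is just the deleted variance() helper plus
the squared-mean correction:

#include <stdint.h>

static unsigned int block_variance(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   int w, int h, unsigned int *sse) {
  int sum = 0;
  int i, j;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
  /* variance = sum of squared differences minus the squared-mean term */
  return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}
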
diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h
deleted file mode 100644
index 4a194b7..0000000
--- a/libvpx/vp9/encoder/vp9_variance.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_VARIANCE_H_
-#define VP9_ENCODER_VP9_VARIANCE_H_
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void variance(const uint8_t *a, int a_stride,
-              const uint8_t *b, int b_stride,
-              int  w, int  h,
-              unsigned int *sse, int *sum);
-
-typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int ref_stride);
-
-typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
-                                        int source_stride,
-                                        const uint8_t *ref_ptr,
-                                        int ref_stride,
-                                        const uint8_t *second_pred);
-
-typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
-                                   int source_stride,
-                                   const uint8_t *ref_ptr,
-                                   int  ref_stride,
-                                   unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
-                                     int source_stride,
-                                     const uint8_t* const ref_ptr[],
-                                     int  ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
-                                          int source_stride,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
-                                                int source_stride,
-                                                int xoffset,
-                                                int yoffset,
-                                                const uint8_t *ref_ptr,
-                                                int Refstride,
-                                                unsigned int *sse);
-
-typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
-                                                   int source_stride,
-                                                   int xoffset,
-                                                   int yoffset,
-                                                   const uint8_t *ref_ptr,
-                                                   int Refstride,
-                                                   unsigned int *sse,
-                                                   const uint8_t *second_pred);
-
-typedef struct vp9_variance_vtable {
-  vp9_sad_fn_t               sdf;
-  vp9_sad_avg_fn_t           sdaf;
-  vp9_variance_fn_t          vf;
-  vp9_subpixvariance_fn_t    svf;
-  vp9_subp_avg_variance_fn_t svaf;
-  vp9_sad_multi_fn_t         sdx3f;
-  vp9_sad_multi_fn_t         sdx8f;
-  vp9_sad_multi_d_fn_t       sdx4df;
-} vp9_variance_fn_ptr_t;
-
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
new file mode 100644
index 0000000..4531d79
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -0,0 +1,424 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_ports/mem.h"
+
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0  = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 32) >> 6;
+}
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+  __m128i a0 = in[0];
+  __m128i a1 = in[1];
+  __m128i a2 = in[2];
+  __m128i a3 = in[3];
+  __m128i a4 = in[4];
+  __m128i a5 = in[5];
+  __m128i a6 = in[6];
+  __m128i a7 = in[7];
+
+  __m128i b0 = _mm_add_epi16(a0, a1);
+  __m128i b1 = _mm_sub_epi16(a0, a1);
+  __m128i b2 = _mm_add_epi16(a2, a3);
+  __m128i b3 = _mm_sub_epi16(a2, a3);
+  __m128i b4 = _mm_add_epi16(a4, a5);
+  __m128i b5 = _mm_sub_epi16(a4, a5);
+  __m128i b6 = _mm_add_epi16(a6, a7);
+  __m128i b7 = _mm_sub_epi16(a6, a7);
+
+  a0 = _mm_add_epi16(b0, b2);
+  a1 = _mm_add_epi16(b1, b3);
+  a2 = _mm_sub_epi16(b0, b2);
+  a3 = _mm_sub_epi16(b1, b3);
+  a4 = _mm_add_epi16(b4, b6);
+  a5 = _mm_add_epi16(b5, b7);
+  a6 = _mm_sub_epi16(b4, b6);
+  a7 = _mm_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm_add_epi16(a0, a4);
+    b7 = _mm_add_epi16(a1, a5);
+    b3 = _mm_add_epi16(a2, a6);
+    b4 = _mm_add_epi16(a3, a7);
+    b2 = _mm_sub_epi16(a0, a4);
+    b6 = _mm_sub_epi16(a1, a5);
+    b1 = _mm_sub_epi16(a2, a6);
+    b5 = _mm_sub_epi16(a3, a7);
+
+    a0 = _mm_unpacklo_epi16(b0, b1);
+    a1 = _mm_unpacklo_epi16(b2, b3);
+    a2 = _mm_unpackhi_epi16(b0, b1);
+    a3 = _mm_unpackhi_epi16(b2, b3);
+    a4 = _mm_unpacklo_epi16(b4, b5);
+    a5 = _mm_unpacklo_epi16(b6, b7);
+    a6 = _mm_unpackhi_epi16(b4, b5);
+    a7 = _mm_unpackhi_epi16(b6, b7);
+
+    b0 = _mm_unpacklo_epi32(a0, a1);
+    b1 = _mm_unpacklo_epi32(a4, a5);
+    b2 = _mm_unpackhi_epi32(a0, a1);
+    b3 = _mm_unpackhi_epi32(a4, a5);
+    b4 = _mm_unpacklo_epi32(a2, a3);
+    b5 = _mm_unpacklo_epi32(a6, a7);
+    b6 = _mm_unpackhi_epi32(a2, a3);
+    b7 = _mm_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm_unpacklo_epi64(b0, b1);
+    in[1] = _mm_unpackhi_epi64(b0, b1);
+    in[2] = _mm_unpacklo_epi64(b2, b3);
+    in[3] = _mm_unpackhi_epi64(b2, b3);
+    in[4] = _mm_unpacklo_epi64(b4, b5);
+    in[5] = _mm_unpackhi_epi64(b4, b5);
+    in[6] = _mm_unpacklo_epi64(b6, b7);
+    in[7] = _mm_unpackhi_epi64(b6, b7);
+  } else {
+    in[0] = _mm_add_epi16(a0, a4);
+    in[7] = _mm_add_epi16(a1, a5);
+    in[3] = _mm_add_epi16(a2, a6);
+    in[4] = _mm_add_epi16(a3, a7);
+    in[2] = _mm_sub_epi16(a0, a4);
+    in[6] = _mm_sub_epi16(a1, a5);
+    in[1] = _mm_sub_epi16(a2, a6);
+    in[5] = _mm_sub_epi16(a3, a7);
+  }
+}
+
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+                           int16_t *coeff) {
+  __m128i src[8];
+  src[0] = _mm_load_si128((const __m128i *)src_diff);
+  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+  hadamard_col8_sse2(src, 0);
+  hadamard_col8_sse2(src, 1);
+
+  _mm_store_si128((__m128i *)coeff, src[0]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[1]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[2]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[3]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[4]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[5]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[6]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    _mm_store_si128((__m128i *)coeff, coeff0);
+    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+  }
+}
+
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+  int i;
+  __m128i sum = _mm_load_si128((const __m128i *)coeff);
+  __m128i sign = _mm_srai_epi16(sum, 15);
+  __m128i val = _mm_xor_si128(sum, sign);
+  sum = _mm_sub_epi16(val, sign);
+  coeff += 8;
+
+  for (i = 8; i < length; i += 8) {
+    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    sign = _mm_srai_epi16(src_line, 15);
+    val = _mm_xor_si128(src_line, sign);
+    val = _mm_sub_epi16(val, sign);
+    sum = _mm_add_epi16(sum, val);
+    coeff += 8;
+  }
+
+  val = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, val);
+  val = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, val);
+  val = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, val);
+
+  return _mm_extract_epi16(sum, 0);
+}
+
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+
+  if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
+
+  _mm_storeu_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_load_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+                        const int bwl) {
+  int idx;
+  int width = 4 << bwl;
+  int16_t mean;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+  __m128i v1 = _mm_load_si128((const __m128i *)src);
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sum = diff;
+  __m128i sse = _mm_madd_epi16(diff, diff);
+
+  ref += 8;
+  src += 8;
+
+  for (idx = 8; idx < width; idx += 8) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+
+    sum = _mm_add_epi16(sum, diff);
+    v0  = _mm_madd_epi16(diff, diff);
+    sse = _mm_add_epi32(sse, v0);
+
+    ref += 8;
+    src += 8;
+  }
+
+  v0  = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, v0);
+
+  v1  = _mm_srli_si128(sse, 8);
+  sse = _mm_add_epi32(sse, v1);
+  v1  = _mm_srli_epi64(sse, 32);
+  sse = _mm_add_epi32(sse, v1);
+
+  mean = _mm_extract_epi16(sum, 0);
+
+  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
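
Note: the new SSE2 file above adds averaging, Hadamard/SATD, row/column
projection, and vector-variance helpers. As a cross-check for
vp9_avg_8x8_sse2(), the intrinsics amount to the scalar reference below; the
rounding matches the (avg + 32) >> 6 at the end of the SSE2 version, and
vp9_avg_4x4 uses (avg + 8) >> 4 the same way:

#include <stdint.h>

static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += s[r * p + c];  /* p is the row stride, as in the SSE2 version */
  return (sum + 32) >> 6;   /* rounded mean of the 64 pixels */
}
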
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
index f71181c..7a7a6b6 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -7,6 +7,9 @@
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
+
+%define private_prefix vp9
+
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
@@ -62,9 +65,40 @@
   psllw           m2,        2
   psllw           m3,        2
 
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor            m4,             m4
+  pxor            m5,             m5
+  pcmpgtw         m4,             m0
+  pcmpgtw         m5,             m1
+  movq            m6,             m0
+  movq            m7,             m1
+  punpcklwd       m0,             m4
+  punpcklwd       m1,             m5
+  punpckhwd       m6,             m4
+  punpckhwd       m7,             m5
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m6
+  movq            [outputq + 16], m1
+  movq            [outputq + 24], m7
+  pxor            m4,             m4
+  pxor            m5,             m5
+  pcmpgtw         m4,             m2
+  pcmpgtw         m5,             m3
+  movq            m6,             m2
+  movq            m7,             m3
+  punpcklwd       m2,             m4
+  punpcklwd       m3,             m5
+  punpckhwd       m6,             m4
+  punpckhwd       m7,             m5
+  movq            [outputq + 32], m2
+  movq            [outputq + 40], m6
+  movq            [outputq + 48], m3
+  movq            [outputq + 56], m7
+%else
   movq            [outputq],      m0
   movq            [outputq + 8],  m1
   movq            [outputq + 16], m2
   movq            [outputq + 24], m3
+%endif
 
   RET
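
Note: the CONFIG_VP9_HIGHBITDEPTH block added to the assembly above only
changes how results are stored: with high bit depth the transform output type
widens to 32 bits, so each 16-bit lane is sign-extended (pcmpgtw against zero
builds the sign mask, punpcklwd/punpckhwd interleave it as the upper halves)
before the movq stores. With the flag off, the original four 16-bit stores
are kept. A plain-C model of the widened store step:

#include <stdint.h>

static void store_coeffs_highbd(const int16_t *in, int32_t *out, int n) {
  int i;
  for (i = 0; i < n; ++i)
    out[i] = in[i];  /* implicit sign extension from 16 to 32 bits */
}
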
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index 487deef..fa37b6f 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -8,239 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <emmintrin.h>  // SSE2
-#include "vp9/common/vp9_idct.h"  // for cospi constants
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
-void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
-  __m128i in0, in1;
-  __m128i tmp;
-  const __m128i zero = _mm_setzero_si128();
-  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
-  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
-  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
-         (input +  2 * stride)));
-  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
-         (input +  3 * stride)));
-
-  tmp = _mm_add_epi16(in0, in1);
-  in0 = _mm_unpacklo_epi16(zero, tmp);
-  in1 = _mm_unpackhi_epi16(zero, tmp);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(tmp, zero);
-  in1 = _mm_unpackhi_epi32(tmp, zero);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(tmp, 8);
-
-  in1 = _mm_add_epi32(tmp, in0);
-  in0 = _mm_slli_epi32(in1, 1);
-  _mm_store_si128((__m128i *)(output), in0);
-}
-
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77).  The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to at i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations.  The outputs, o0 through oF are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
-                                            cospi_16_64, cospi_16_64,
-                                            cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, cospi_16_64,
-                                            cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
-                                            cospi_8_64, cospi_24_64,
-                                            cospi_24_64, -cospi_8_64,
-                                            cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
-                                            cospi_24_64, -cospi_8_64,
-                                            cospi_8_64, cospi_24_64,
-                                            cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
-                                            cospi_16_64, cospi_16_64,
-                                            cospi_16_64, cospi_16_64,
-                                            cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, -cospi_16_64,
-                                            cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
-                                            cospi_8_64, cospi_24_64,
-                                            -cospi_8_64, -cospi_24_64,
-                                            -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
-                                            cospi_24_64, -cospi_8_64,
-                                            -cospi_24_64, cospi_8_64,
-                                            -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
-                                               +(DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-
-  // Load inputs.
-  {
-    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
-    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
-    in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
-           (input +  2 * stride)));
-    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
-           (input +  3 * stride)));
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-
-
-    // multiply by 16 to give some extra precision
-    in0 = _mm_slli_epi16(in0, 4);
-    in1 = _mm_slli_epi16(in1, 4);
-    // if (i == 0 && input[0]) input[0] += 1;
-    // add 1 to the upper left pixel if it is non-zero, which helps reduce
-    // the round-trip error
-    {
-      // The mask will only contain whether the first value is zero, all
-      // other comparison will fail as something shifted by 4 (above << 4)
-      // can never be equal to one. To increment in the non-zero case, we
-      // add the mask and one for the first element:
-      //   - if zero, mask = -1, v = v - 1 + 1 = v
-      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-      in0 = _mm_add_epi16(in0, mask);
-      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-    }
-  }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // followed by an multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    const __m128i t0 = _mm_add_epi16(in0, in1);
-    const __m128i t1 = _mm_sub_epi16(in0, in1);
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-    // Then add and right-shift to get back to 16-bit range
-    // but this combines the final right-shift as well to save operations
-    // This unusual rounding operations is to maintain bit-accurate
-    // compatibility with the c version of this function which has two
-    // rounding steps in a row.
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-    // w0 = [o0 o4 o8 oC]
-    // w1 = [o2 o6 oA oE]
-    // w2 = [o1 o5 o9 oD]
-    // w3 = [o3 o7 oB oF]
-    // remember the o's are numbered according to the correct output location
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-    // y1 = [o2 o3 o6 o7 oA oB oE oF]
-    in0 = _mm_unpacklo_epi32(y0, y1);
-    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-    in1 = _mm_unpackhi_epi32(y0, y1);
-    // in1 = [o8 o9 oA oB oC oD oE oF]
-  }
-  // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands.  Only 2 store instructions needed
-  // because we are using the fact that 1/3 are stored just after 0/2.
-  {
-     _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
-     _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
-  }
-}
-
-
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                    int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
@@ -262,7 +39,7 @@
   in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
 }
 
-static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
   const __m128i kOne = _mm_set1_epi16(1);
   __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
   __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
@@ -270,8 +47,8 @@
   __m128i out23 = _mm_add_epi16(in23, kOne);
   out01 = _mm_srai_epi16(out01, 2);
   out23 = _mm_srai_epi16(out23, 2);
-  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
-  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+  store_output(&out01, (output + 0 * 8));
+  store_output(&out23, (output + 1 * 8));
 }
 
 static INLINE void transpose_4x4(__m128i *res) {
@@ -293,8 +70,8 @@
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+static void fdct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -326,12 +103,12 @@
   transpose_4x4(in);
 }
 
-void fadst4_sse2(__m128i *in) {
+static void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
   const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
   const __m128i kZero = _mm_set1_epi16(0);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
@@ -374,13 +151,13 @@
   transpose_4x4(in);
 }
 
-void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[4];
 
   switch (tx_type) {
     case DCT_DCT:
-      vp9_fdct4x4_sse2(input, output, stride);
+      vpx_fdct4x4_sse2(input, output, stride);
       break;
     case ADST_DCT:
       load_buffer_4x4(input, in, stride);
@@ -406,53 +183,22 @@
   }
 }
 
-void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
-  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i u0, u1, sum;
-
-  u0 = _mm_add_epi16(in0, in1);
-  u1 = _mm_add_epi16(in2, in3);
-
-  in0  = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in1  = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in2  = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in3  = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  sum = _mm_add_epi16(u0, u1);
-
-  in0 = _mm_add_epi16(in0, in1);
-  in2 = _mm_add_epi16(in2, in3);
-  sum = _mm_add_epi16(sum, in0);
-
-  u0  = _mm_setzero_si128();
-  sum = _mm_add_epi16(sum, in2);
-
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  _mm_store_si128((__m128i *)(output), in1);
-}
-
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                            int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                            uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  __m128i zero;
   int pass;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -470,6 +216,14 @@
   __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
   __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
   __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];
+  int index = 0;
+
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)coeff_ptr;
+
   // Pre-condition input (shift by two)
   in0 = _mm_slli_epi16(in0, 2);
   in1 = _mm_slli_epi16(in1, 2);
@@ -480,6 +234,15 @@
   in6 = _mm_slli_epi16(in6, 2);
   in7 = _mm_slli_epi16(in7, 2);
 
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
   // We do two passes, first the columns, then the rows. The results of the
   // first pass are transposed so that the same column code can be reused. The
   // results of the second pass are also transposed so that the rows (processed
@@ -690,15 +453,175 @@
     in5 = _mm_srai_epi16(in5, 1);
     in6 = _mm_srai_epi16(in6, 1);
     in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
-    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
-    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
-    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
-    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
-    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
-    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
-    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+  }
+
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    index = 2;
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
   }
 }
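The quantization half of this fused transform/quantize kernel works lane by lane on 16-bit coefficients: take the absolute value with the shift/xor/subtract "poor man's sign extract", add the rounding term with saturation, keep the high 16 bits of the multiply by the quantizer, restore the sign, then form the dequantized value and track the end-of-block position as iscan + 1 over nonzero lanes. Lane 0 of the first group uses the DC entries of round/quant/dequant; every later lane uses the AC entries, which is what the unpackhi_epi64 shuffles arrange. When skip_block is set the kernel instead zeroes qcoeff/dqcoeff and reports an eob of 0. A minimal scalar model of one lane, with illustrative names that are not part of libvpx:

  #include <stdint.h>

  /* Hypothetical scalar model of one lane of the SSE2 quantizer above. */
  static int16_t quantize_lane(int16_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, int16_t iscan,
                               int16_t *dqcoeff, int *eob) {
    const int16_t sign = coeff >> 15;                 /* 0 or -1 ("poor man's" sign) */
    const int32_t abs_coeff = (coeff ^ sign) - sign;  /* |coeff| */
    const int32_t tmp = abs_coeff + round > INT16_MAX
                            ? INT16_MAX : abs_coeff + round;   /* _mm_adds_epi16 */
    const int16_t qtmp = (int16_t)((tmp * quant) >> 16);       /* _mm_mulhi_epi16 */
    const int16_t qcoeff = (int16_t)((qtmp ^ sign) - sign);    /* re-apply the sign */
    *dqcoeff = (int16_t)(qcoeff * dequant);                    /* _mm_mullo_epi16 */
    if (*dqcoeff != 0 && iscan + 1 > *eob)
      *eob = iscan + 1;   /* eob = 1 + highest scan position holding a nonzero value */
    return qcoeff;
  }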
 
@@ -725,9 +648,7 @@
 }
 
 // right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, int const bit) {
-  const __m128i kOne = _mm_set1_epi16(1);
-  const int bit_m02 = bit - 2;
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
   __m128i sign0 = _mm_srai_epi16(res[0], 15);
   __m128i sign1 = _mm_srai_epi16(res[1], 15);
   __m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -737,16 +658,16 @@
   __m128i sign6 = _mm_srai_epi16(res[6], 15);
   __m128i sign7 = _mm_srai_epi16(res[7], 15);
 
-  if (bit_m02 >= 0) {
-    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
-    res[0] = _mm_add_epi16(res[0], k_const_rounding);
-    res[1] = _mm_add_epi16(res[1], k_const_rounding);
-    res[2] = _mm_add_epi16(res[2], k_const_rounding);
-    res[3] = _mm_add_epi16(res[3], k_const_rounding);
-    res[4] = _mm_add_epi16(res[4], k_const_rounding);
-    res[5] = _mm_add_epi16(res[5], k_const_rounding);
-    res[6] = _mm_add_epi16(res[6], k_const_rounding);
-    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  if (bit == 2) {
+    const __m128i const_rounding = _mm_set1_epi16(1);
+    res[0] = _mm_add_epi16(res[0], const_rounding);
+    res[1] = _mm_add_epi16(res[1], const_rounding);
+    res[2] = _mm_add_epi16(res[2], const_rounding);
+    res[3] = _mm_add_epi16(res[3], const_rounding);
+    res[4] = _mm_add_epi16(res[4], const_rounding);
+    res[5] = _mm_add_epi16(res[5], const_rounding);
+    res[6] = _mm_add_epi16(res[6], const_rounding);
+    res[7] = _mm_add_epi16(res[7], const_rounding);
   }
 
   res[0] = _mm_sub_epi16(res[0], sign0);
@@ -758,26 +679,38 @@
   res[6] = _mm_sub_epi16(res[6], sign6);
   res[7] = _mm_sub_epi16(res[7], sign7);
 
-  res[0] = _mm_srai_epi16(res[0], bit);
-  res[1] = _mm_srai_epi16(res[1], bit);
-  res[2] = _mm_srai_epi16(res[2], bit);
-  res[3] = _mm_srai_epi16(res[3], bit);
-  res[4] = _mm_srai_epi16(res[4], bit);
-  res[5] = _mm_srai_epi16(res[5], bit);
-  res[6] = _mm_srai_epi16(res[6], bit);
-  res[7] = _mm_srai_epi16(res[7], bit);
+  if (bit == 1) {
+    res[0] = _mm_srai_epi16(res[0], 1);
+    res[1] = _mm_srai_epi16(res[1], 1);
+    res[2] = _mm_srai_epi16(res[2], 1);
+    res[3] = _mm_srai_epi16(res[3], 1);
+    res[4] = _mm_srai_epi16(res[4], 1);
+    res[5] = _mm_srai_epi16(res[5], 1);
+    res[6] = _mm_srai_epi16(res[6], 1);
+    res[7] = _mm_srai_epi16(res[7], 1);
+  } else {
+    res[0] = _mm_srai_epi16(res[0], 2);
+    res[1] = _mm_srai_epi16(res[1], 2);
+    res[2] = _mm_srai_epi16(res[2], 2);
+    res[3] = _mm_srai_epi16(res[3], 2);
+    res[4] = _mm_srai_epi16(res[4], 2);
+    res[5] = _mm_srai_epi16(res[5], 2);
+    res[6] = _mm_srai_epi16(res[6], 2);
+    res[7] = _mm_srai_epi16(res[7], 2);
+  }
 }
 
 // write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
-  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
-  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
-  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
-  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
-  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
-  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
-  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
-  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+                                    int stride) {
+  store_output(&res[0], (output + 0 * stride));
+  store_output(&res[1], (output + 1 * stride));
+  store_output(&res[2], (output + 2 * stride));
+  store_output(&res[3], (output + 3 * stride));
+  store_output(&res[4], (output + 4 * stride));
+  store_output(&res[5], (output + 5 * stride));
+  store_output(&res[6], (output + 6 * stride));
+  store_output(&res[7], (output + 7 * stride));
 }
 
 // perform in-place transpose
@@ -832,9 +765,9 @@
   // 07 17 27 37 47 57 67 77
 }
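The in-place transpose used between the two 1-D passes (array_transpose_8x8, whose body lies outside this hunk) follows the standard three-level unpack ladder for an 8x8 block of 16-bit values: interleave 16-bit pairs, then 32-bit pairs, then 64-bit halves. A generic, self-contained sketch of that pattern assuming SSE2 only; this is not the file's own implementation:

  #include <emmintrin.h>

  /* Transpose eight rows of eight int16 values held in r[0..7]. */
  static void transpose_8x8_epi16(__m128i *r) {
    const __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);  /* 00 10 01 11 02 12 03 13 */
    const __m128i a1 = _mm_unpackhi_epi16(r[0], r[1]);
    const __m128i a2 = _mm_unpacklo_epi16(r[2], r[3]);
    const __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
    const __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
    const __m128i a5 = _mm_unpackhi_epi16(r[4], r[5]);
    const __m128i a6 = _mm_unpacklo_epi16(r[6], r[7]);
    const __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
    const __m128i b0 = _mm_unpacklo_epi32(a0, a2);      /* 00 10 20 30 01 11 21 31 */
    const __m128i b1 = _mm_unpackhi_epi32(a0, a2);
    const __m128i b2 = _mm_unpacklo_epi32(a1, a3);
    const __m128i b3 = _mm_unpackhi_epi32(a1, a3);
    const __m128i b4 = _mm_unpacklo_epi32(a4, a6);
    const __m128i b5 = _mm_unpackhi_epi32(a4, a6);
    const __m128i b6 = _mm_unpacklo_epi32(a5, a7);
    const __m128i b7 = _mm_unpackhi_epi32(a5, a7);
    r[0] = _mm_unpacklo_epi64(b0, b4);                  /* 00 10 20 30 40 50 60 70 */
    r[1] = _mm_unpackhi_epi64(b0, b4);
    r[2] = _mm_unpacklo_epi64(b1, b5);
    r[3] = _mm_unpackhi_epi64(b1, b5);
    r[4] = _mm_unpacklo_epi64(b2, b6);
    r[5] = _mm_unpackhi_epi64(b2, b6);
    r[6] = _mm_unpacklo_epi64(b3, b7);
    r[7] = _mm_unpackhi_epi64(b3, b7);
  }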
 
-void fdct8_sse2(__m128i *in) {
+static void fdct8_sse2(__m128i *in) {
   // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -972,7 +905,7 @@
   array_transpose_8x8(in, in);
 }
 
-void fadst8_sse2(__m128i *in) {
+static void fadst8_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -986,7 +919,7 @@
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__const_0 = _mm_set1_epi16(0);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
@@ -1202,13 +1135,13 @@
   array_transpose_8x8(in, in);
 }
 
-void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[8];
 
   switch (tx_type) {
     case DCT_DCT:
-      vp9_fdct8x8_sse2(input, output, stride);
+      vpx_fdct8x8_sse2(input, output, stride);
       break;
     case ADST_DCT:
       load_buffer_8x8(input, in, stride);
@@ -1237,699 +1170,6 @@
   }
 }
 
-void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
-  __m128i in0, in1, in2, in3;
-  __m128i u0, u1;
-  __m128i sum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 2; ++i) {
-    input += 8 * i;
-    in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
-
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    sum = _mm_add_epi16(sum, u1);
-  }
-
-  u0  = _mm_setzero_si128();
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  in1 = _mm_srai_epi32(in1, 1);
-  _mm_store_si128((__m128i *)(output), in1);
-}
-
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  const int16_t *in = input;
-  int16_t *out = intermediate;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
-        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
-        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
-        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
-        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
-        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
-        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
-        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
-        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
-        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
-        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
-        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
-        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
-        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
-        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
-        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
-        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
-        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
-        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
-        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
-        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = _mm_add_epi16(in00, in15);
-        input1 = _mm_add_epi16(in01, in14);
-        input2 = _mm_add_epi16(in02, in13);
-        input3 = _mm_add_epi16(in03, in12);
-        input4 = _mm_add_epi16(in04, in11);
-        input5 = _mm_add_epi16(in05, in10);
-        input6 = _mm_add_epi16(in06, in09);
-        input7 = _mm_add_epi16(in07, in08);
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = _mm_sub_epi16(in07, in08);
-        step1_1 = _mm_sub_epi16(in06, in09);
-        step1_2 = _mm_sub_epi16(in05, in10);
-        step1_3 = _mm_sub_epi16(in04, in11);
-        step1_4 = _mm_sub_epi16(in03, in12);
-        step1_5 = _mm_sub_epi16(in02, in13);
-        step1_6 = _mm_sub_epi16(in01, in14);
-        step1_7 = _mm_sub_epi16(in00, in15);
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = _mm_add_epi16(input0, input7);
-        const __m128i q1 = _mm_add_epi16(input1, input6);
-        const __m128i q2 = _mm_add_epi16(input2, input5);
-        const __m128i q3 = _mm_add_epi16(input3, input4);
-        const __m128i q4 = _mm_sub_epi16(input3, input4);
-        const __m128i q5 = _mm_sub_epi16(input2, input5);
-        const __m128i q6 = _mm_sub_epi16(input1, input6);
-        const __m128i q7 = _mm_sub_epi16(input0, input7);
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = _mm_add_epi16(q0, q3);
-          const __m128i r1 = _mm_add_epi16(q1, q2);
-          const __m128i r2 = _mm_sub_epi16(q1, q2);
-          const __m128i r3 = _mm_sub_epi16(q0, q3);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res00 = _mm_packs_epi32(w0, w1);
-          res08 = _mm_packs_epi32(w2, w3);
-          res04 = _mm_packs_epi32(w4, w5);
-          res12 = _mm_packs_epi32(w6, w7);
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-          // Combine
-          const __m128i r0 = _mm_packs_epi32(s0, s1);
-          const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/subtract
-          const __m128i x0 = _mm_add_epi16(q4, r0);
-          const __m128i x1 = _mm_sub_epi16(q4, r0);
-          const __m128i x2 = _mm_sub_epi16(q7, r1);
-          const __m128i x3 = _mm_add_epi16(q7, r1);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res02 = _mm_packs_epi32(w0, w1);
-          res14 = _mm_packs_epi32(w2, w3);
-          res10 = _mm_packs_epi32(w4, w5);
-          res06 = _mm_packs_epi32(w6, w7);
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_2 = _mm_packs_epi32(w0, w1);
-          step2_3 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_5 = _mm_packs_epi32(w0, w1);
-          step2_4 = _mm_packs_epi32(w2, w3);
-        }
-        // step 3
-        {
-          step3_0 = _mm_add_epi16(step1_0, step2_3);
-          step3_1 = _mm_add_epi16(step1_1, step2_2);
-          step3_2 = _mm_sub_epi16(step1_1, step2_2);
-          step3_3 = _mm_sub_epi16(step1_0, step2_3);
-          step3_4 = _mm_sub_epi16(step1_7, step2_4);
-          step3_5 = _mm_sub_epi16(step1_6, step2_5);
-          step3_6 = _mm_add_epi16(step1_6, step2_5);
-          step3_7 = _mm_add_epi16(step1_7, step2_4);
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_1 = _mm_packs_epi32(w0, w1);
-          step2_2 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_6 = _mm_packs_epi32(w0, w1);
-          step2_5 = _mm_packs_epi32(w2, w3);
-        }
-        // step 5
-        {
-          step1_0 = _mm_add_epi16(step3_0, step2_1);
-          step1_1 = _mm_sub_epi16(step3_0, step2_1);
-          step1_2 = _mm_add_epi16(step3_3, step2_2);
-          step1_3 = _mm_sub_epi16(step3_3, step2_2);
-          step1_4 = _mm_sub_epi16(step3_4, step2_5);
-          step1_5 = _mm_add_epi16(step3_4, step2_5);
-          step1_6 = _mm_sub_epi16(step3_7, step2_6);
-          step1_7 = _mm_add_epi16(step3_7, step2_6);
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res01 = _mm_packs_epi32(w0, w1);
-          res09 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res05 = _mm_packs_epi32(w0, w1);
-          res13 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res11 = _mm_packs_epi32(w0, w1);
-          res03 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res15 = _mm_packs_epi32(w0, w1);
-          res07 = _mm_packs_epi32(w2, w3);
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
-        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
-        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
-        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
-        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
-        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
-        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
-        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
-      }
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        // Store results
-        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
-        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
-        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
-        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
-        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
-        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
-        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
-        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
-      }
-      out += 8*16;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-}
-
 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
                                      __m128i *in1, int stride) {
   // load first 8 columns
@@ -1942,7 +1182,7 @@
   load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
 }
 
-static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                       __m128i *in1, int stride) {
   // write first 8 columns
   write_buffer_8x8(output, in0, stride);
@@ -1978,10 +1218,10 @@
   right_shift_8x8(res1 + 8, 2);
 }
 
-void fdct16_8col(__m128i *in) {
+static void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -2300,7 +1540,7 @@
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_8col(__m128i *in) {
+static void fadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2328,8 +1568,8 @@
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2770,25 +2010,25 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_sse2(__m128i *in0, __m128i *in1) {
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
   fdct16_8col(in0);
   fdct16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void fadst16_sse2(__m128i *in0, __m128i *in1) {
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
   fadst16_8col(in0);
   fadst16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                        int stride, int tx_type) {
   __m128i in0[16], in1[16];
 
   switch (tx_type) {
     case DCT_DCT:
-      vp9_fdct16x16_sse2(input, output, stride);
+      vpx_fdct16x16_sse2(input, output, stride);
       break;
     case ADST_DCT:
       load_buffer_16x16(input, in0, in1, stride);
@@ -2816,86 +2056,3 @@
       break;
   }
 }
-
-void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
-  __m128i in0, in1, in2, in3;
-  __m128i u0, u1;
-  __m128i sum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 8; ++i) {
-    in0  = _mm_load_si128((const __m128i *)(input +  0));
-    in1  = _mm_load_si128((const __m128i *)(input +  8));
-    in2  = _mm_load_si128((const __m128i *)(input + 16));
-    in3  = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input +  0));
-    in1  = _mm_load_si128((const __m128i *)(input +  8));
-    in2  = _mm_load_si128((const __m128i *)(input + 16));
-    in3  = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input +  0));
-    in1  = _mm_load_si128((const __m128i *)(input +  8));
-    in2  = _mm_load_si128((const __m128i *)(input + 16));
-    in3  = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0  = _mm_load_si128((const __m128i *)(input +  0));
-    in1  = _mm_load_si128((const __m128i *)(input +  8));
-    in2  = _mm_load_si128((const __m128i *)(input + 16));
-    in3  = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0  = _mm_add_epi16(in0, in1);
-    u1  = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    sum = _mm_add_epi16(sum, u1);
-  }
-
-  u0  = _mm_setzero_si128();
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  in1 = _mm_srai_epi32(in1, 3);
-  _mm_store_si128((__m128i *)(output), in1);
-}
-
-#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef  FDCT32x32_2D
-#undef  FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D vp9_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
-#undef  FDCT32x32_2D
-#undef  FDCT32x32_HIGH_PRECISION
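The block removed above relied on the usual "template by #include" trick: vp9_dct32x32_sse2.c is compiled twice with different macro settings to emit the rate-distortion (vp9_fdct32x32_rd_sse2) and high-precision (vp9_fdct32x32_sse2) variants. After this pull those entry points are presumably supplied by vpx_dsp instead, matching the switch to the vpx_fdct* calls earlier in this file. A generic illustration of the pattern, using made-up names:

  /* shared_impl.inc -- the shared body, compiled once per instantiation */
  void FN_NAME(int *data, int n) {
    int i;
    for (i = 0; i < n; ++i)
      data[i] = HIGH_PRECISION ? data[i] : (data[i] + 1) >> 1;  /* stand-in body */
  }

  /* user.c -- instantiate the template twice under different names */
  #define FN_NAME example_transform_rd
  #define HIGH_PRECISION 0
  #include "shared_impl.inc"
  #undef FN_NAME
  #undef HIGH_PRECISION

  #define FN_NAME example_transform
  #define HIGH_PRECISION 1
  #include "shared_impl.inc"
  #undef FN_NAME
  #undef HIGH_PRECISION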
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
new file mode 100644
index 0000000..b09eac0
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -0,0 +1,472 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before including tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+                             int16_t* coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t* zbin_ptr,
+                             const int16_t* round_ptr, const int16_t* quant_ptr,
+                             const int16_t* quant_shift_ptr,
+                             int16_t* qcoeff_ptr,
+                             int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                             uint16_t* eob_ptr,
+                             const int16_t* scan_ptr,
+                             const int16_t* iscan_ptr) {
+  __m128i zero;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];
+  int index = 0;
+
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)coeff_ptr;
+
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_sub_epi16(q6, q5);
+      const __m128i d1 = _mm_add_epi16(q6, q5);
+      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
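+    //    e.g. n = -3: (-3 - (-1)) >> 1 = -1 and n = 3: (3 - 0) >> 1 = 1,
+    //    matching C integer division (truncation toward zero).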
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+  }
+
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant, thr;
+    int16_t nzflag;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract
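+        // abs(x) = (x ^ s) - s, with s = x >> 15 (0 if x >= 0, -1 if x < 0).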
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    index = 2;
+    thr = _mm_srai_epi16(dequant, 1);
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
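+        // If no |coeff| exceeds dequant/2, the pair of vectors is treated as
+        // quantizing entirely to zero and the else branch stores zeros.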
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB
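+    // Horizontal max over the eight 16-bit lanes; the result (index of the
+    // last nonzero coefficient in scan order, plus one) ends up in lane 1.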
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 28458dc..74c52df 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -7,55 +7,18 @@
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
+
+%define private_prefix vp9
+
 %include "third_party/x86inc/x86inc.asm"
 
 ; This file provides SSSE3 version of the forward transformation. Part
 ; of the macro definitions are originally derived from the ffmpeg project.
 ; The current version applies to x86 64-bit only.
 
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585,  11585
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
 SECTION .text
 
 %if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
 ; matrix transpose
 %macro INTERLEAVE_2X 4
   punpckh%1          m%4, m%2, m%3
@@ -83,58 +46,52 @@
   SWAP  %4, %7
 %endmacro
 
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
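+; 1-D 8-point Hadamard transform: three butterfly stages over register pairs
+; at distance 1, 2 and 4 (sums and differences only, no rounding).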
+%macro HMD8_1D 0
+  psubw              m8, m0, m1
+  psubw              m9, m2, m3
+  paddw              m0, m1
+  paddw              m2, m3
+  SWAP               1, 8
+  SWAP               3, 9
+  psubw              m8, m4, m5
+  psubw              m9, m6, m7
+  paddw              m4, m5
+  paddw              m6, m7
+  SWAP               5, 8
+  SWAP               7, 9
 
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
+  psubw              m8, m0, m2
+  psubw              m9, m1, m3
+  paddw              m0, m2
+  paddw              m1, m3
+  SWAP               2, 8
+  SWAP               3, 9
+  psubw              m8, m4, m6
+  psubw              m9, m5, m7
+  paddw              m4, m6
+  paddw              m5, m7
+  SWAP               6, 8
+  SWAP               7, 9
 
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
+  psubw              m8, m0, m4
+  psubw              m9, m1, m5
+  paddw              m0, m4
+  paddw              m1, m5
+  SWAP               4, 8
+  SWAP               5, 9
+  psubw              m8, m2, m6
+  psubw              m9, m3, m7
+  paddw              m2, m6
+  paddw              m3, m7
+  SWAP               6, 8
+  SWAP               7, 9
 %endmacro
 
 INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
-  pxor              m11, m11
-
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
+
   mova               m0, [inputq]
   mova               m1, [inputq + r3]
   lea                inputq, [inputq + r4]
@@ -147,27 +104,9 @@
   mova               m6, [inputq]
   mova               m7, [inputq + r3]
 
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  FDCT8_1D  0
+  HMD8_1D
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
+  HMD8_1D
 
   mova              [outputq +   0], m0
   mova              [outputq +  16], m1
diff --git a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
new file mode 100644
index 0000000..bf7c7af
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -0,0 +1,375 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
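+  // Sign-extend the signed 8-bit sums to 16 bits: interleaving each byte
+  // with itself and arithmetic-shifting right by 8 replicates its sign.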
+  const __m128i acc_diff_lo =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba =
+      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba =
+      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+  return _mm_cvtsi128_si32(hgfedcba);
+}
+
+// Denoise a 16x1 vector.
+static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
+                                             const uint8_t *mc_running_avg_y,
+                                             uint8_t *running_avg_y,
+                                             const __m128i *k_0,
+                                             const __m128i *k_4,
+                                             const __m128i *k_8,
+                                             const __m128i *k_16,
+                                             const __m128i *l3,
+                                             const __m128i *l32,
+                                             const __m128i *l21,
+                                             __m128i acc_diff) {
+  // Calculate differences
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp absolute difference to 16 to be used to get mask. Doing this
+  // allows us to use _mm_cmpgt_epi8, which operates on signed bytes.
+  const __m128i clamped_absdiff =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+  // Get masks for l2 l1 and l0 adjustments.
+  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+  // Get adjustments for l2, l1, and l0.
+  __m128i adj2 = _mm_and_si128(mask2, *l32);
+  const __m128i adj1 = _mm_and_si128(mask1, *l21);
+  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+  __m128i adj, padj, nadj;
+
+  // Combine the adjustments and get absolute adjustments.
+  adj2 = _mm_add_epi8(adj2, adj1);
+  adj = _mm_sub_epi8(*l3, adj2);
+  adj = _mm_andnot_si128(mask0, adj);
+  adj = _mm_or_si128(adj, adj0);
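+  // Net adjustment: the absolute difference itself below 4, then l3 - 3,
+  // l3 - 2 and l3 as it crosses the 4, 8 and 16 thresholds.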
+
+  // Restore the sign and get positive and negative adjustments.
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+
+  // Calculate filtered value.
+  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Adjustments <=7, and each element in acc_diff can fit in signed
+  // char.
+  acc_diff = _mm_adds_epi8(acc_diff, padj);
+  acc_diff = _mm_subs_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter.
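+// Moves the already-denoised value back toward the source signal by at most
+// k_delta per pixel, i.e. in the opposite direction of the first pass.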
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y,
+    uint8_t *running_avg_y, const __m128i k_0,
+    const __m128i k_delta, __m128i acc_diff) {
+  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+  // Calculate differences.
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+  // Clamp absolute difference to delta to get the adjustment.
+  const __m128i adj =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+  // Restore the sign and get positive and negative adjustments.
+  __m128i padj, nadj;
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+  // Calculate filtered value.
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the adjustments.
+  acc_diff = _mm_subs_epi8(acc_diff, padj);
+  acc_diff = _mm_adds_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoiser for 4xM and 8xM blocks.
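+// Each iteration packs 16 pixels (four 4-wide rows or two 8-wide rows) into
+// contiguous scratch buffers so the 16x1 kernel above can be reused, then
+// copies the filtered rows back out.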
+static int vp9_denoiser_NxM_sse2_small(
+    const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+    int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+    int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  const uint8_t shift = (width == 4) ? 2 : 1;
+
+  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width,
+           mc_running_avg_y + mc_avg_y_stride, width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    if (width == 4) {
+      memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
+      memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
+      memcpy(mc_running_buffer[r] + width * 2,
+             mc_running_avg_y + mc_avg_y_stride * 2, width);
+      memcpy(mc_running_buffer[r] + width * 3,
+             mc_running_avg_y + mc_avg_y_stride * 3, width);
+      memcpy(running_buffer[r] + width * 2,
+             running_avg_y + avg_y_stride * 2, width);
+      memcpy(running_buffer[r] + width * 3,
+             running_avg_y + avg_y_stride * 3, width);
+    }
+    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
+                                      mc_running_buffer[r],
+                                      running_buffer[r],
+                                      &k_0, &k_4, &k_8, &k_16,
+                                      &l3, &l32, &l21, acc_diff);
+    memcpy(running_avg_y, running_buffer[r], width);
+    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+    if (width == 4) {
+      memcpy(running_avg_y + avg_y_stride * 2,
+             running_buffer[r] + width * 2, width);
+      memcpy(running_avg_y + avg_y_stride * 3,
+             running_buffer[r] + width * 3, width);
+    }
+    // Update pointers for next iteration.
+    sig += (sig_stride << shift);
+    mc_running_avg_y += (mc_avg_y_stride << shift);
+    running_avg_y += (avg_y_stride << shift);
+  }
+
+  {
+    sum_diff = sum_diff_16x1(acc_diff);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+          acc_diff = vp9_denoiser_adj_16x1_sse2(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+              k_0, k_delta, acc_diff);
+          memcpy(running_avg_y, running_buffer[r], width);
+          memcpy(running_avg_y + avg_y_stride,
+                 running_buffer[r] + width, width);
+          if (width == 4) {
+            memcpy(running_avg_y + avg_y_stride * 2,
+                   running_buffer[r] + width * 2, width);
+            memcpy(running_avg_y + avg_y_stride * 3,
+                   running_buffer[r] + width * 3, width);
+          }
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << shift);
+        }
+        sum_diff = sum_diff_16x1(acc_diff);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+// Denoiser for 16xM, 32xM and 64xM blocks
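+// One signed 8-bit accumulator per 16x16 tile; a tile's total is folded into
+// sum_diff once its rows are done (every 16 rows, or after row 7 for
+// BLOCK_16X8).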
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y,
+                                     int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  int sum_diff_thresh, r, c, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
+  __m128i acc_diff[4][4];
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r) {
+      acc_diff[c][r] = _mm_setzero_si128();
+    }
+  }
+
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+    mc_running_avg_y = mc_running_avg_y -
+                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                       mc_avg_y_stride;
+    running_avg_y = running_avg_y -
+                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                    avg_y_stride;
+  }
+
+  {
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
+
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
+        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sum_diff = 0;
+        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+                sig, mc_running_avg_y, running_avg_y, k_0,
+                k_delta, acc_diff[c>>4][r>>4]);
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+
+          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+            }
+          }
+          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+          mc_running_avg_y = mc_running_avg_y -
+                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                             mc_avg_y_stride;
+          running_avg_y = running_avg_y -
+                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                          avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg,
+                             int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising,
+                             BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 4);
+  } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 8);
+  } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
+             bs == BLOCK_32X16 || bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
+             bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+    return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
+                                     mc_avg, mc_avg_stride,
+                                     avg, avg_stride,
+                                     increase_denoising,
+                                     bs, motion_magnitude);
+  } else {
+    return COPY_BLOCK;
+  }
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
index c67490f..dfebaab 100644
--- a/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
+++ b/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -9,8 +9,9 @@
  */
 
 #include <immintrin.h>  // AVX2
-#include "vpx/vpx_integer.h"
 
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
 
 int64_t vp9_block_error_avx2(const int16_t *coeff,
                              const int16_t *dqcoeff,
diff --git a/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb..5b02382 100644
--- a/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -8,6 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+%define private_prefix vp9
+
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
@@ -72,3 +74,49 @@
   movd    edx, m5
 %endif
   RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+;                            intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq
+.loop:
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0
+  pmaddwd   m1, m1
+  ; accumulate in 64bit
+  punpckldq m3, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m3
+  punpckldq m3, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m3
+  paddq     m4, m1
+  add    sizeq, mmsize
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  pshufd   m5, m4, 0x1
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
diff --git a/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000..c245cca
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i+=8) {
+    // Load the data into xmm registers
+    __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+    // Check if any values require more than 15 bits
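+    // (Such values can be packed to 16 bits without saturation, so the
+    // 16-bit difference/madd path below is exact.)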
+    max = _mm_set1_epi32(0x3fff);
+    min = _mm_set1_epi32(0xffffc000);
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+            _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+            _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+            _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+            _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+            _mm_or_si128(cmp2, cmp3)));
+
+    if (!test) {
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i*)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+        error +=  diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
new file mode 100644
index 0000000..2071dfe
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
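+// Fast-path ("fp") quantizer: zbin, quant_shift and scan are accepted only to
+// match the common prototype and are unused (see the (void) casts below);
+// coefficients are rounded, scaled by quant and dequantized in one pass.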
+void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t* zbin_ptr,
+                          const int16_t* round_ptr, const int16_t* quant_ptr,
+                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                          uint16_t* eob_ptr,
+                          const int16_t* scan_ptr,
+                          const int16_t* iscan_ptr) {
+  __m128i zero;
+  __m128i thr;
+  int16_t nzflag;
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    thr = _mm_srai_epi16(dequant, 1);
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 508e1d4..ec61c0c 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -8,6 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+%define private_prefix vp9
+
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
@@ -15,212 +17,9 @@
 
 SECTION .text
 
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
-  mova                            m0, [zbinq]              ; m0 = zbin
-  punpcklwd                       m4, m4
-  mova                            m1, [roundq]             ; m1 = round
-  pshufd                          m4, m4, 0
-  mova                            m2, [quantq]             ; m2 = quant
-  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                        r6, m7
-  pmovmskb                        r2, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
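
The deleted QUANTIZE_FN block above is the SSSE3 implementation of the VP9
b / b_32x32 quantizer. As a rough scalar sketch of what it computes per
coefficient (names are illustrative only; the real vp9_quantize_b also halves
zbin/round up front in the 32x32 variant and uses separate DC/AC constants):

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative scalar model of the deleted SSSE3 QUANTIZE_FN path. */
  static void quantize_b_sketch(const int16_t *coeff, int n, int16_t zbin,
                                int16_t round, int16_t quant, int16_t quant_shift,
                                int16_t dequant, const int16_t *iscan,
                                int is_32x32, int16_t *qcoeff, int16_t *dqcoeff,
                                uint16_t *eob) {
    int i, max_eob = 0;
    for (i = 0; i < n; i++) {
      const int abs_c = abs(coeff[i]);
      const int sign = coeff[i] < 0 ? -1 : 1;
      int q = 0;
      if (abs_c >= zbin) {                       /* pcmpgtw against zbin        */
        int tmp = abs_c + round;                 /* paddsw round                */
        tmp += (tmp * quant) >> 16;              /* pmulhw + paddw              */
        q = (tmp * quant_shift) >> 16;           /* pmulhw with the shift term  */
      }
      qcoeff[i] = (int16_t)(sign * q);           /* psignw + pand               */
      /* b_32x32 halves the dequantized value (pabsw, pmullw, psrlw 1, psignw). */
      dqcoeff[i] = (int16_t)(sign * ((q * dequant) >> (is_32x32 ? 1 : 0)));
      if (q && iscan[i] + 1 > max_eob) max_eob = iscan[i] + 1;  /* pandn/pmaxsw */
    }
    *eob = (uint16_t)max_eob;                    /* .accumulate_eob horizontal max */
  }
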
-
 %macro QUANTIZE_FP 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                shift, qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
@@ -248,11 +47,11 @@
   psllw                           m2, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+
   lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                            r5q, [  r5q+ncoeffq*2]
+  lea                            r3q, [ r3q+ncoeffq*2]
+  lea                            r4q, [r4q+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
@@ -270,28 +69,30 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   psignw                          m8, m9                   ; m8 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
+  mova            [r3q+ncoeffq*2+ 0], m8
+  mova            [r3q+ncoeffq*2+16], m13
 %ifidn %1, fp_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
 %endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
   punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
 %ifidn %1, fp_32x32
   psrlw                           m8, 1
   psrlw                          m13, 1
   psignw                          m8, m9
   psignw                         m13, m10
   psrlw                           m0, m3, 2
+%else
+  psrlw                           m0, m3, 1
 %endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
+  mova            [r4q+ncoeffq*2+ 0], m8
+  mova            [r4q+ncoeffq*2+16], m13
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
   psubw                           m6, m7                   ; m6 = scan[i] + 1
   psubw                          m11, m7                   ; m11 = scan[i] + 1
   pandn                           m8, m6                   ; m8 = max(eob)
@@ -305,15 +106,15 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
-%ifidn %1, fp_32x32
+
   pcmpgtw                         m7, m6,  m0
   pcmpgtw                        m12, m11, m0
-  pmovmskb                        r6, m7
-  pmovmskb                        r2, m12
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
 
   or                              r6, r2
   jz .skip_iter
-%endif
+
   pcmpeqw                         m7, m7
 
   paddsw                          m6, m1                   ; m6 += round
@@ -322,26 +123,26 @@
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   psignw                         m14, m9                   ; m14 = reinsert sign
   psignw                         m13, m10                  ; m13 = reinsert sign
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
+  mova            [r3q+ncoeffq*2+ 0], m14
+  mova            [r3q+ncoeffq*2+16], m13
 %ifidn %1, fp_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
 %endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
 %ifidn %1, fp_32x32
   psrlw                          m14, 1
   psrlw                          m13, 1
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
+  mova            [r4q+ncoeffq*2+ 0], m14
+  mova            [r4q+ncoeffq*2+16], m13
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
   psubw                           m6, m7                   ; m6 = scan[i] + 1
   psubw                          m11, m7                   ; m11 = scan[i] + 1
   pandn                          m14, m6                   ; m14 = max(eob)
@@ -351,16 +152,14 @@
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 
-%ifidn %1, fp_32x32
   jmp .accumulate_eob
 .skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
+  mova            [r3q+ncoeffq*2+ 0], m5
+  mova            [r3q+ncoeffq*2+16], m5
+  mova            [r4q+ncoeffq*2+ 0], m5
+  mova            [r4q+ncoeffq*2+16], m5
   add                        ncoeffq, mmsize
   jl .ac_only_loop
-%endif
 
 .accumulate_eob:
   ; horizontally accumulate/max eobs and write into [eob] memory pointer
@@ -372,7 +171,7 @@
   pshuflw                         m7, m8, 0x1
   pmaxsw                          m8, m7
   pextrw                          r6, m8, 0
-  mov                             [r2], r6
+  mov                           [r2], r6
   RET
 
   ; skip-block, i.e. just write all zeroes
@@ -381,19 +180,19 @@
   movifnidn                  ncoeffq, ncoeffmp
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+
+  lea                            r0q, [r0q+ncoeffq*2]
+  lea                            r2q, [r2q+ncoeffq*2]
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
+  mova            [r0q+ncoeffq*2+ 0], m7
+  mova            [r0q+ncoeffq*2+16], m7
+  mova            [r2q+ncoeffq*2+ 0], m7
+  mova            [r2q+ncoeffq*2+16], m7
   add                        ncoeffq, mmsize
   jl .blank_loop
-  mov                    word [eobq], 0
+  mov                     word [r3q], 0
   RET
 %endmacro
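
QUANTIZE_FP, modified above, is the encoder's fast-path quantizer: it has no
zbin or quant_shift stage, this change drops the unused zbin_oq argument, and
the qcoeff/dqcoeff/iscan pointers are now addressed through r3q/r4q/r5q
directly. A rough scalar model follows (illustrative names; DC and AC use
separate round/quant/dequant values, and per the psrlw on the dequant register
above the SIMD loop also zero-fills whole 16-coefficient groups whose
magnitudes all fall below a dequant-derived threshold, dequant >> 2 for
fp_32x32 and dequant >> 1 otherwise):

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative scalar model of the QUANTIZE_FP path. */
  static void quantize_fp_sketch(const int16_t *coeff, int n, int16_t round,
                                 int16_t quant, int16_t dequant,
                                 const int16_t *iscan, int is_32x32,
                                 int16_t *qcoeff, int16_t *dqcoeff,
                                 uint16_t *eob) {
    int i, max_eob = 0;
    for (i = 0; i < n; i++) {
      const int abs_c = abs(coeff[i]);
      const int sign = coeff[i] < 0 ? -1 : 1;
      const int q = ((abs_c + round) * quant) >> 16;     /* paddsw + pmulhw */
      qcoeff[i] = (int16_t)(sign * q);                   /* psignw          */
      /* fp_32x32 halves the dequantized value (pmullw, psrlw 1, psignw).  */
      dqcoeff[i] = (int16_t)(sign * ((q * dequant) >> (is_32x32 ? 1 : 0)));
      if (q && iscan[i] + 1 > max_eob) max_eob = iscan[i] + 1;
    }
    *eob = (uint16_t)max_eob;
  }
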
 
diff --git a/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
deleted file mode 100644
index 455d10d..0000000
--- a/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,216 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
-sym(vp9_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
-sym(vp9_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
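
The deleted vp9_ssim_opt_x86_64.asm only accumulated the five raw SSIM sums
per block (the TABULATE_SSIM macro). A scalar equivalent of what it computed,
with illustrative names:

  #include <stdint.h>

  /* Sums accumulated by the deleted TABULATE_SSIM macro over a width x height
   * block of source (s, stride sp) and reference (r, stride rp) pixels. */
  static void ssim_parms_sketch(const uint8_t *s, int sp,
                                const uint8_t *r, int rp,
                                int width, int height,
                                uint32_t *sum_s, uint32_t *sum_r,
                                uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                                uint32_t *sum_sxr) {
    int i, j;
    *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
    for (i = 0; i < height; i++, s += sp, r += rp) {
      for (j = 0; j < width; j++) {
        *sum_s    += s[j];                /* paddusw xmm15 */
        *sum_r    += r[j];                /* paddusw xmm14 */
        *sum_sq_s += s[j] * s[j];         /* pmaddwd, paddd xmm13 */
        *sum_sq_r += r[j] * r[j];         /* pmaddwd, paddd xmm12 */
        *sum_sxr  += s[j] * r[j];         /* pmaddwd, paddd xmm11 */
      }
    }
  }
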
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
deleted file mode 100644
index 7f81f46..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-typedef void (*get_var_avx2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-void vp9_get16x16var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-void vp9_get32x32var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  int height,
-  unsigned int *sse
-);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  const uint8_t *sec,
-  int sec_stride,
-  int height,
-  unsigned int *sseptr
-);
-
-static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
-                        const unsigned char *ref_ptr, int  recon_stride,
-                        int  w, int  h, unsigned int *sse, int *sum,
-                        get_var_avx2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      // processing 16 rows horizontally each call
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-unsigned int vp9_variance16x16_avx2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_avx2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp9_mse16x16_avx2(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
-}
-
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
-}
-
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
-}
-
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
-}
-
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
-}
-
-unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride,
-                                              int x_offset,
-                                              int y_offset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           64, &sse);
-  // processing the next 32 elements in parallel
-  unsigned int sse2;
-  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                            x_offset, y_offset,
-                                            dst + 32, dst_stride,
-                                            64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride,
-                                              int x_offset,
-                                              int y_offset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           32, &sse);
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
-                                                  int src_stride,
-                                                  int x_offset,
-                                                  int y_offset,
-                                                  const uint8_t *dst,
-                                                  int dst_stride,
-                                                  unsigned int *sseptr,
-                                                  const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 64, 64, &sse);
-  unsigned int sse2;
-  // processing the next 32 elements in parallel
-  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                                y_offset, dst + 32, dst_stride,
-                                                sec + 32, 64, 64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sseptr = sse;
-
-  return sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
-                                                  int src_stride,
-                                                  int x_offset,
-                                                  int y_offset,
-                                                  const uint8_t *dst,
-                                                  int dst_stride,
-                                                  unsigned int *sseptr,
-                                                  const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                                 y_offset, dst, dst_stride,
-                                                 sec, 32, 32, &sse);
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
-}
-
-
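
Every wrapper deleted from this file reduces to the identity
variance = SSE - Sum^2 / N, with N = w*h expressed as a shift in the original
code (>> 8 for 16x16, >> 10 for 32x32, >> 12 for 64x64, and so on). A minimal
sketch, assuming the per-block SSE and Sum have already been produced by a
helper such as the deleted vp9_get32x32var_avx2:

  #include <stdint.h>

  /* Illustrative only: combine a block's SSE and Sum into its variance. */
  static unsigned int block_variance(unsigned int sse, int sum, int w, int h) {
    return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
  }
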
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
deleted file mode 100644
index f992328..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h>  // AVX2
-
-void vp9_get16x16var_avx2(const unsigned char *src_ptr,
-                          int source_stride,
-                          const unsigned char *ref_ptr,
-                          int recon_stride,
-                          unsigned int *SSE,
-                          int *Sum) {
-    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-    __m256i ref_expand_high, madd_low, madd_high;
-    unsigned int i, src_2strides, ref_2strides;
-    __m256i zero_reg = _mm256_set1_epi16(0);
-    __m256i sum_ref_src = _mm256_set1_epi16(0);
-    __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-    // processing two strides in a 256 bit register, reducing the number
-    // of loop iterations by half (compared to the sse2 code)
-    src_2strides = source_stride << 1;
-    ref_2strides = recon_stride << 1;
-    for (i = 0; i < 8; i++) {
-        src = _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i const *) (src_ptr)));
-        src = _mm256_inserti128_si256(src,
-              _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);
-
-        ref =_mm256_castsi128_si256(
-             _mm_loadu_si128((__m128i const *) (ref_ptr)));
-        ref = _mm256_inserti128_si256(ref,
-              _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);
-
-        // expanding to 16 bit each lane
-        src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-        src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-        ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-        ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-        // src-ref
-        src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-        src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-        // madd low (src - ref)
-        madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-        // add high to low
-        src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-        // madd high (src - ref)
-        madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-        sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-        // add high to low
-        madd_ref_src = _mm256_add_epi32(madd_ref_src,
-                       _mm256_add_epi32(madd_low, madd_high));
-
-        src_ptr+= src_2strides;
-        ref_ptr+= ref_2strides;
-    }
-
-    {
-        __m128i sum_res, madd_res;
-        __m128i expand_sum_low, expand_sum_high, expand_sum;
-        __m128i expand_madd_low, expand_madd_high, expand_madd;
-        __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-        // extract the low lane and add it to the high lane
-        sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
-                                _mm256_extractf128_si256(sum_ref_src, 1));
-
-        madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
-                                 _mm256_extractf128_si256(madd_ref_src, 1));
-
-        // padding each 2 bytes with another 2 zeroed bytes
-        expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
-                                            sum_res);
-        expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
-                                             sum_res);
-
-        // shifting the sign 16 bits right
-        expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
-        expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
-        expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
-        // expand each 32 bits of the madd result to 64 bits
-        expand_madd_low = _mm_unpacklo_epi32(madd_res,
-                          _mm256_castsi256_si128(zero_reg));
-        expand_madd_high = _mm_unpackhi_epi32(madd_res,
-                           _mm256_castsi256_si128(zero_reg));
-
-        expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
-        ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
-                            _mm256_castsi256_si128(zero_reg));
-        ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
-                             _mm256_castsi256_si128(zero_reg));
-
-        ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-        // shift right by 8 bytes
-        madd_res = _mm_srli_si128(expand_madd, 8);
-        sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
-        madd_res = _mm_add_epi32(madd_res, expand_madd);
-        sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
-        *((int*)SSE)= _mm_cvtsi128_si32(madd_res);
-
-        *((int*)Sum)= _mm_cvtsi128_si32(sum_res);
-    }
-}
-
-void vp9_get32x32var_avx2(const unsigned char *src_ptr,
-                          int source_stride,
-                          const unsigned char *ref_ptr,
-                          int recon_stride,
-                          unsigned int *SSE,
-                          int *Sum) {
-    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-    __m256i ref_expand_high, madd_low, madd_high;
-    unsigned int i;
-    __m256i zero_reg = _mm256_set1_epi16(0);
-    __m256i sum_ref_src = _mm256_set1_epi16(0);
-    __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-    // processing 32 elements in parallel
-    for (i = 0; i < 16; i++) {
-       src = _mm256_loadu_si256((__m256i const *) (src_ptr));
-
-       ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));
-
-       // expanding to 16 bit each lane
-       src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-       src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-       ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-       ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-       // src-ref
-       src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-       src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-       // madd low (src - ref)
-       madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-       // add high to low
-       src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-       // madd high (src - ref)
-       madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-       sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-       // add high to low
-       madd_ref_src = _mm256_add_epi32(madd_ref_src,
-                      _mm256_add_epi32(madd_low, madd_high));
-
-       src_ptr+= source_stride;
-       ref_ptr+= recon_stride;
-    }
-
-    {
-      __m256i expand_sum_low, expand_sum_high, expand_sum;
-      __m256i expand_madd_low, expand_madd_high, expand_madd;
-      __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-      // padding each 2 bytes with another 2 zeroed bytes
-      expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
-      expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
-      // shifting the sign 16 bits right
-      expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
-      expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
-      expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
-      // expand each 32 bits of the madd result to 64 bits
-      expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
-      expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
-      expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
-      ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
-      ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
-      ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-      // shift right by 8 bytes
-      madd_ref_src = _mm256_srli_si256(expand_madd, 8);
-      sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
-      madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
-      sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
-      // extract the low lane and the high lane and add the results
-      *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
-      _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
-      *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
-      _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
-    }
-}
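
The non-obvious part of the deleted intrinsics is the reduction of the packed
16-bit difference sums to a signed scalar: each 16-bit lane is interleaved with
zeros below it and then arithmetic-shifted right by 16 to recover its sign. A
minimal SSE2 sketch of that idiom (hypothetical helper, not part of the deleted
API):

  #include <emmintrin.h>  /* SSE2 */

  /* Sign-extend the eight 16-bit lanes of v to 32 bits and add them up,
   * mirroring the unpack-with-zero + arithmetic-shift reduction used by the
   * deleted vp9_get16x16var_avx2. */
  static int sum_epi16(__m128i v) {
    const __m128i zero = _mm_setzero_si128();
    __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);
    __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);
    __m128i s = _mm_add_epi32(lo, hi);
    s = _mm_add_epi32(s, _mm_srli_si128(s, 8));   /* fold upper 8 bytes */
    s = _mm_add_epi32(s, _mm_srli_si128(s, 4));   /* fold upper 4 bytes */
    return _mm_cvtsi128_si32(s);
  }
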
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
deleted file mode 100644
index 3501cf1..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ /dev/null
@@ -1,510 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx) PRIVATE
-sym(vp9_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get8x8var_mmx) PRIVATE
-sym(vp9_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get4x4var_mmx) PRIVATE
-sym(vp9_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx) PRIVATE
-sym(vp9_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
deleted file mode 100644
index 4830412..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ /dev/null
@@ -1,401 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2) PRIVATE
-sym(vp9_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get16x16var_sse2) PRIVATE
-sym(vp9_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  dword ptr [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
deleted file mode 100644
index ce1c832..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
-
-unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *SSE, int *sum);
-
-unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride,
-                                 unsigned int *sse) {
-  int sum;
-  vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride,
-                                 unsigned int *sse) {
-  int sum;
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3;
-  int sum0, sum1, sum2, sum3;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
-  *sse = sse0 + sse1 + sse2 + sse3;
-  return *sse;
-}
-
-
-unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3;
-  int sum0, sum1, sum2, sum3, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
-  *sse = sse0 + sse1 + sse2 + sse3;
-  sum = sum0 + sum1 + sum2 + sum3;
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  unsigned int sse0, sse1;
-  int sum0, sum1, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-
-  *sse = sse0 + sse1;
-  sum = sum0 + sum1;
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-
-unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  unsigned int sse0, sse1;
-  int sum0, sum1, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
-
-  *sse = sse0 + sse1;
-  sum = sum0 + sum1;
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
deleted file mode 100644
index e935a23..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
-                                       const unsigned char *ref, int ref_stride,
-                                       unsigned int *sse, int *sum);
-
-unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
-
-
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
-                                const unsigned char *ref, int ref_stride,
-                                unsigned int *sse, int *sum);
-
-unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse, int *sum);
-
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride,
-                          int w, int h, unsigned int *sse, int *sum,
-                          variance_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride,
-             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
-                sse, &sum, vp9_get4x4var_mmx, 4);
-  return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
-                sse, &sum, vp9_get4x4var_mmx, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
-                sse, &sum, vp9_get4x4var_mmx, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
-                sse, &sum, vp9_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
-                sse, &sum, vp9_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
-                sse, &sum, vp9_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-#define DECL(w, opt) \
-int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
-                                        ptrdiff_t src_stride, \
-                                        int x_offset, int y_offset, \
-                                        const uint8_t *dst, \
-                                        ptrdiff_t dst_stride, \
-                                        int height, unsigned int *sse)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
-                                                     int src_stride, \
-                                                     int x_offset, \
-                                                     int y_offset, \
-                                                     const uint8_t *dst, \
-                                                     int dst_stride, \
-                                                     unsigned int *sse_ptr) { \
-  unsigned int sse; \
-  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
-                                                y_offset, dst, dst_stride, \
-                                                h, &sse); \
-  if (w > wf) { \
-    unsigned int sse2; \
-    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
-                                                   x_offset, y_offset, \
-                                                   dst + 16, dst_stride, \
-                                                   h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                 x_offset, y_offset, \
-                                                 dst + 32, dst_stride, \
-                                                 h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
-                                                 x_offset, y_offset, \
-                                                 dst + 48, dst_stride, \
-                                                 h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
-FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
-FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
-FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
-FN(4,   4,  4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
-
-#define DECL(w, opt) \
-int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
-                                            ptrdiff_t src_stride, \
-                                            int x_offset, int y_offset, \
-                                            const uint8_t *dst, \
-                                            ptrdiff_t dst_stride, \
-                                            const uint8_t *sec, \
-                                            ptrdiff_t sec_stride, \
-                                            int height, unsigned int *sse)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
-                                                         int src_stride, \
-                                                         int x_offset, \
-                                                         int y_offset, \
-                                                         const uint8_t *dst, \
-                                                         int dst_stride, \
-                                                         unsigned int *sseptr, \
-                                                         const uint8_t *sec) { \
-  unsigned int sse; \
-  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
-                                                    y_offset, dst, dst_stride, \
-                                                    sec, w, h, &sse); \
-  if (w > wf) { \
-    unsigned int sse2; \
-    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
-                                                       x_offset, y_offset, \
-                                                       dst + 16, dst_stride, \
-                                                       sec + 16, w, h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                     x_offset, y_offset, \
-                                                     dst + 32, dst_stride, \
-                                                     sec + 32, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
-                                                     x_offset, y_offset, \
-                                                     dst + 48, dst_stride, \
-                                                     sec + 48, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sseptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
-FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
-FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
-FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
-FN(4,   4,  4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 8e3e885..d0135c6 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -13,14 +13,10 @@
 VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_blockd.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.h
 VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
 VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.c
@@ -31,14 +27,15 @@
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_enums.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.c
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
 VP9_COMMON_SRCS-yes += common/vp9_mv.h
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
-VP9_COMMON_SRCS-yes += common/vp9_prob.h
-VP9_COMMON_SRCS-yes += common/vp9_prob.c
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
@@ -48,14 +45,11 @@
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
-VP9_COMMON_SRCS-yes += common/vp9_thread.h
-VP9_COMMON_SRCS-yes += common/vp9_thread.c
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -67,86 +61,35 @@
 VP9_COMMON_SRCS-yes += common/vp9_scan.c
 VP9_COMMON_SRCS-yes += common/vp9_scan.h
 
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
-ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
-endif
-
-# common (c)
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred4_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred8_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred16_dspr2.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
-
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c
-ifeq ($(ARCH_X86_64), yes)
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
 endif
 
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+# common (msa)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index bf8eec7..f155b9a 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -12,7 +12,8 @@
 #include <string.h>
 
 #include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -21,7 +22,6 @@
 #include "vp9/vp9_iface_common.h"
 
 struct vp9_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
   int                         cpu_used;  // available cpu percentage in 1/16
   unsigned int                enable_auto_alt_ref;
   unsigned int                noise_sensitivity;
@@ -31,49 +31,46 @@
   unsigned int                tile_rows;
   unsigned int                arnr_max_frames;
   unsigned int                arnr_strength;
-  unsigned int                arnr_type;
+  unsigned int                min_gf_interval;
+  unsigned int                max_gf_interval;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;  // constrained quality level
   unsigned int                rc_max_intra_bitrate_pct;
+  unsigned int                rc_max_inter_bitrate_pct;
+  unsigned int                gf_cbr_boost_pct;
   unsigned int                lossless;
   unsigned int                frame_parallel_decoding_mode;
   AQ_MODE                     aq_mode;
   unsigned int                frame_periodic_boost;
-  BIT_DEPTH                   bit_depth;
+  vpx_bit_depth_t             bit_depth;
   vp9e_tune_content           content;
+  vpx_color_space_t           color_space;
 };
 
-struct extraconfig_map {
-  unsigned int usage;
-  struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    { // NOLINT
-      NULL,
-      0,                          // cpu_used
-      1,                          // enable_auto_alt_ref
-      0,                          // noise_sensitivity
-      0,                          // sharpness
-      0,                          // static_thresh
-      0,                          // tile_columns
-      0,                          // tile_rows
-      7,                          // arnr_max_frames
-      5,                          // arnr_strength
-      3,                          // arnr_type
-      VP8_TUNE_PSNR,              // tuning
-      10,                         // cq_level
-      0,                          // rc_max_intra_bitrate_pct
-      0,                          // lossless
-      0,                          // frame_parallel_decoding_mode
-      NO_AQ,                      // aq_mode
-      0,                          // frame_periodic_delta_q
-      BITS_8,                     // Bit depth
-      VP9E_CONTENT_DEFAULT        // content
-    }
-  }
+static struct vp9_extracfg default_extra_cfg = {
+  0,                          // cpu_used
+  1,                          // enable_auto_alt_ref
+  0,                          // noise_sensitivity
+  0,                          // sharpness
+  0,                          // static_thresh
+  6,                          // tile_columns
+  0,                          // tile_rows
+  7,                          // arnr_max_frames
+  5,                          // arnr_strength
+  0,                          // min_gf_interval; 0 -> default decision
+  0,                          // max_gf_interval; 0 -> default decision
+  VP8_TUNE_PSNR,              // tuning
+  10,                         // cq_level
+  0,                          // rc_max_intra_bitrate_pct
+  0,                          // rc_max_inter_bitrate_pct
+  0,                          // gf_cbr_boost_pct
+  0,                          // lossless
+  1,                          // frame_parallel_decoding_mode
+  NO_AQ,                      // aq_mode
+  0,                          // frame_periodic_delta_q
+  VPX_BITS_8,                 // Bit depth
+  VP9E_CONTENT_DEFAULT,       // content
+  VPX_CS_UNKNOWN,             // color space
 };
 
 struct vpx_codec_alg_priv {
@@ -90,9 +87,13 @@
   size_t                  pending_frame_sizes[8];
   size_t                  pending_frame_magnitude;
   vpx_image_t             preview_img;
+  vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
   vpx_codec_pkt_list_decl(256) pkt_list;
   unsigned int                 fixed_kf_cntr;
+  vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
+  // BufferPool that holds all reference frames.
+  BufferPool              *buffer_pool;
 };
 
 static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -161,8 +162,8 @@
   RANGE_CHECK_HI(cfg, g_threads,          64);
   RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
   RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
@@ -170,16 +171,48 @@
   RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
   RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
   RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+  RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  if (extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+  }
+  if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+      (MAX_LAG_BUFFERS - 1));
+  }
 
   if (cfg->rc_resize_allowed == 1) {
-    RANGE_CHECK(cfg, rc_scaled_width, 1, cfg->g_w);
-    RANGE_CHECK(cfg, rc_scaled_height, 1, cfg->g_h);
+    RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
+    RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
   }
 
   RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
+  RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
+
+  if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS)
+    ERROR("ss_number_layers * ts_number_layers is out of range");
+  if (cfg->ts_number_layers > 1) {
+    unsigned int sl, tl;
+    for (sl = 1; sl < cfg->ss_number_layers; ++sl) {
+      for (tl = 1; tl < cfg->ts_number_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cfg->ts_number_layers);
+        if (cfg->layer_target_bitrate[layer] <
+            cfg->layer_target_bitrate[layer - 1])
+        ERROR("ts_target_bitrate entries are not increasing");
+      }
+    }
+
+    RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1);
+    for (tl = cfg->ts_number_layers - 2; tl > 0; --tl)
+      if (cfg->ts_rate_decimator[tl - 1] != 2 * cfg->ts_rate_decimator[tl])
+        ERROR("ts_rate_decimator factors are not powers of 2");
+  }
 
 #if CONFIG_SPATIAL_SVC
-  if (cfg->ss_number_layers > 1) {
+
+  if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
+      cfg->g_pass == VPX_RC_LAST_PASS) {
     unsigned int i, alt_ref_sum = 0;
     for (i = 0; i < cfg->ss_number_layers; ++i) {
       if (cfg->ss_enable_auto_alt_ref[i])
@@ -187,22 +220,12 @@
     }
     if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
       ERROR("Not enough ref buffers for svc alt ref frames");
+    if (cfg->ss_number_layers * cfg->ts_number_layers > 3 &&
+        cfg->g_error_resilient == 0)
+    ERROR("Multiple frame context are not supported for more than 3 layers");
   }
 #endif
 
-  RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
-  if (cfg->ts_number_layers > 1) {
-    unsigned int i;
-    for (i = 1; i < cfg->ts_number_layers; ++i)
-      if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i - 1])
-        ERROR("ts_target_bitrate entries are not increasing");
-
-    RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1);
-    for (i = cfg->ts_number_layers - 2; i > 0; --i)
-      if (cfg->ts_rate_decimator[i - 1] != 2 * cfg->ts_rate_decimator[i])
-        ERROR("ts_rate_decimator factors are not powers of 2");
-  }
-
   // VP9 does not support a lower bound on the keyframe interval in
   // automatic keyframe placement mode.
   if (cfg->kf_mode != VPX_KF_DISABLED &&
@@ -211,16 +234,17 @@
     ERROR("kf_min_dist not supported in auto mode, use 0 "
           "or kf_max_dist instead.");
 
-  RANGE_CHECK_BOOL(extra_cfg,  enable_auto_alt_ref);
-  RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
+  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+  RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
   RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
   RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
-  RANGE_CHECK(extra_cfg, arnr_type, 1, 3);
   RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+  RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+  RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content,
               VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
 
@@ -239,7 +263,7 @@
     if (cfg->rc_twopass_stats_in.sz % packet_sz)
       ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
 
-    if (cfg->ss_number_layers > 1) {
+    if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) {
       int i;
       unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0};
 
@@ -279,24 +303,50 @@
     }
   }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+  if (cfg->g_profile > (unsigned int)PROFILE_1) {
+    ERROR("Profile > 1 not supported in this build configuration");
+  }
+#endif
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth > BITS_8)
-    ERROR("High bit-depth not supported in profile < 2");
+      cfg->g_bit_depth > VPX_BITS_8) {
+    ERROR("Codec high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 8) {
+    ERROR("Source high bit-depth not supported in profile < 2");
+  }
   if (cfg->g_profile > (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth == BITS_8)
-    ERROR("Bit-depth 8 not supported in profile > 1");
-
+      cfg->g_bit_depth == VPX_BITS_8) {
+    ERROR("Codec bit-depth 8 not supported in profile > 1");
+  }
+  RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   return VPX_CODEC_OK;
 }
 
-
 static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
                                     const vpx_image_t *img) {
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I42016:
+      break;
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
+        ERROR("Invalid image format. I422, I444, I440 images are "
+              "not supported in profile.");
+      }
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+          ctx->cfg.g_profile != (unsigned int)PROFILE_3) {
+        ERROR("Invalid image format. 16-bit I422, I444, I440 images are "
+              "not supported in profile.");
+      }
       break;
     default:
       ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
@@ -316,35 +366,43 @@
     case VPX_IMG_FMT_I420: return 12;
     case VPX_IMG_FMT_I422: return 16;
     case VPX_IMG_FMT_I444: return 24;
+    case VPX_IMG_FMT_I440: return 16;
+    case VPX_IMG_FMT_I42016: return 24;
+    case VPX_IMG_FMT_I42216: return 32;
+    case VPX_IMG_FMT_I44416: return 48;
+    case VPX_IMG_FMT_I44016: return 32;
     default: assert(0 && "Invalid image format"); break;
   }
   return 0;
 }
 
 static vpx_codec_err_t set_encoder_config(
-    VP9EncoderConfig *oxcf,
-    const vpx_codec_enc_cfg_t *cfg,
-    const struct vp9_extracfg *extra_cfg) {
+  VP9EncoderConfig *oxcf,
+  const vpx_codec_enc_cfg_t *cfg,
+  const struct vp9_extracfg *extra_cfg) {
+  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
+  int sl, tl;
   oxcf->profile = cfg->g_profile;
+  oxcf->max_threads = (int)cfg->g_threads;
   oxcf->width   = cfg->g_w;
   oxcf->height  = cfg->g_h;
-  oxcf->bit_depth = extra_cfg->bit_depth;
+  oxcf->bit_depth = cfg->g_bit_depth;
+  oxcf->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
-  oxcf->framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
-  if (oxcf->framerate > 180)
-    oxcf->framerate = 30;
+  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (oxcf->init_framerate > 180)
+    oxcf->init_framerate = 30;
+
+  oxcf->mode = GOOD;
 
   switch (cfg->g_pass) {
     case VPX_RC_ONE_PASS:
-      oxcf->mode = ONE_PASS_GOOD;
       oxcf->pass = 0;
       break;
     case VPX_RC_FIRST_PASS:
-      oxcf->mode = TWO_PASS_FIRST;
       oxcf->pass = 1;
       break;
     case VPX_RC_LAST_PASS:
-      oxcf->mode = TWO_PASS_SECOND_BEST;
       oxcf->pass = 2;
       break;
   }
@@ -356,6 +414,8 @@
   // Convert target bandwidth from Kbit/s to Bit/s
   oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
 
   oxcf->best_allowed_q =
       extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
@@ -367,13 +427,19 @@
   oxcf->under_shoot_pct         = cfg->rc_undershoot_pct;
   oxcf->over_shoot_pct          = cfg->rc_overshoot_pct;
 
-  oxcf->allow_spatial_resampling = cfg->rc_resize_allowed;
-  oxcf->scaled_frame_width       = cfg->rc_scaled_width;
-  oxcf->scaled_frame_height      = cfg->rc_scaled_height;
+  oxcf->scaled_frame_width  = cfg->rc_scaled_width;
+  oxcf->scaled_frame_height = cfg->rc_scaled_height;
+  if (cfg->rc_resize_allowed == 1) {
+    oxcf->resize_mode =
+        (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) ?
+            RESIZE_DYNAMIC : RESIZE_FIXED;
+  } else {
+    oxcf->resize_mode = RESIZE_NONE;
+  }
 
-  oxcf->maximum_buffer_size_ms   = cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms  = cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms   = is_vbr ? 240000 : cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
 
@@ -388,20 +454,21 @@
 
   oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
-  oxcf->play_alternate         =  extra_cfg->enable_auto_alt_ref;
+  oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
   oxcf->sharpness              =  extra_cfg->sharpness;
 
   oxcf->two_pass_stats_in      =  cfg->rc_twopass_stats_in;
-  oxcf->output_pkt_list        =  extra_cfg->pkt_list;
 
 #if CONFIG_FP_MB_STATS
   oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
 #endif
 
+  oxcf->color_space = extra_cfg->color_space;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
-  oxcf->arnr_type       = extra_cfg->arnr_type;
+  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
 
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
@@ -417,32 +484,33 @@
   oxcf->frame_periodic_boost =  extra_cfg->frame_periodic_boost;
 
   oxcf->ss_number_layers = cfg->ss_number_layers;
-
-  if (oxcf->ss_number_layers > 1) {
-    int i;
-    for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
-      oxcf->ss_target_bitrate[i] =  1000 * cfg->ss_target_bitrate[i];
-#if CONFIG_SPATIAL_SVC
-      oxcf->ss_play_alternate[i] =  cfg->ss_enable_auto_alt_ref[i];
-#endif
-    }
-  } else if (oxcf->ss_number_layers == 1) {
-    oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
-  }
-
   oxcf->ts_number_layers = cfg->ts_number_layers;
+  oxcf->temporal_layering_mode = (enum vp9e_temporal_layering_mode)
+      cfg->temporal_layering_mode;
 
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+#if CONFIG_SPATIAL_SVC
+    oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
+#endif
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] =
+          1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl];
+    }
+  }
+  if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) {
+    oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+#if CONFIG_SPATIAL_SVC
+    oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
+#endif
+  }
   if (oxcf->ts_number_layers > 1) {
-    int i;
-    for (i = 0; i < VPX_TS_MAX_LAYERS; ++i) {
-      oxcf->ts_target_bitrate[i] = 1000 * cfg->ts_target_bitrate[i];
-      oxcf->ts_rate_decimator[i] = cfg->ts_rate_decimator[i];
+    for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) {
+      oxcf->ts_rate_decimator[tl] = cfg->ts_rate_decimator[tl] ?
+          cfg->ts_rate_decimator[tl] : 1;
     }
   } else if (oxcf->ts_number_layers == 1) {
-    oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth;
     oxcf->ts_rate_decimator[0] = 1;
   }
-
   /*
   printf("Current VP9 Settings: \n");
   printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -468,7 +536,7 @@
   printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
   printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
   printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-  printf("play_alternate: %d\n", oxcf->play_alternate);
+  printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
   printf("Version: %d\n", oxcf->Version);
   printf("encode_breakout: %d\n", oxcf->encode_breakout);
   printf("error resilient: %d\n", oxcf->error_resilient_mode);
@@ -481,9 +549,16 @@
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t  *cfg) {
   vpx_codec_err_t res;
+  int force_key = 0;
 
-  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
-    ERROR("Cannot change width or height after initialization");
+  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+      ERROR("Cannot change width or height after initialization");
+    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+        (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+      force_key = 1;
+  }
 
   // Prevent increasing lag_in_frames. This check is stricter than it needs
   // to be -- the limit is not increasing past the first lag_in_frames
@@ -497,9 +572,14 @@
   if (res == VPX_CODEC_OK) {
     ctx->cfg = *cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    // On profile change, request a key frame
+    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
+  if (force_key)
+    ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
   return res;
 }
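A minimal usage sketch, not part of the patch itself: with the relaxed check above, a one-pass, low-lag encode may now change resolution between frames through vpx_codec_enc_config_set(); a key frame is forced when the new size is not a valid reference scaling of the old one or exceeds the initial size. The dimensions below are illustrative.

#include "vpx/vpx_encoder.h"

/* Sketch only: shrink the frame size on a running encoder; `codec` and `cfg`
 * are assumed to come from an earlier vpx_codec_enc_init(). */
static vpx_codec_err_t shrink_resolution(vpx_codec_ctx_t *codec,
                                         vpx_codec_enc_cfg_t *cfg) {
  cfg->g_w = 640;  /* previously e.g. 1280 at init time */
  cfg->g_h = 360;  /* previously e.g. 720 */
  return vpx_codec_enc_config_set(codec, cfg);
}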
 
@@ -549,7 +629,7 @@
 static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args);
+  extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -597,9 +677,9 @@
 
 static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
-  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args);
-  return update_extra_cfg(ctx, &extra_cfg);
+  (void)ctx;
+  (void)args;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
@@ -624,6 +704,22 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.rc_max_inter_bitrate_pct =
+      CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_cbr_boost_pct =
+      CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -646,6 +742,20 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_min_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_gf_interval = CAST(VP9E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_max_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_gf_interval = CAST(VP9E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
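A minimal usage sketch, not part of the patch itself: the two new golden-frame interval controls are applied with vpx_codec_control() on an initialized encoder; the interval values are illustrative.

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch only: constrain golden-frame group length to the range [4, 16]. */
static void set_gf_interval_range(vpx_codec_ctx_t *codec) {
  vpx_codec_control(codec, VP9E_SET_MIN_GF_INTERVAL, 4);
  vpx_codec_control(codec, VP9E_SET_MAX_GF_INTERVAL, 16);
}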
 static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -659,52 +769,46 @@
   (void)data;
 
   if (ctx->priv == NULL) {
-    int i;
-    vpx_codec_enc_cfg_t *cfg;
-    struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv));
-
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
     if (priv == NULL)
       return VPX_CODEC_MEM_ERROR;
 
-    ctx->priv = &priv->base;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = priv;
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
     ctx->priv->enc.total_encoders = 1;
+    priv->buffer_pool =
+        (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+    if (priv->buffer_pool == NULL)
+      return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
 
     if (ctx->config.enc) {
       // Update the reference to the config structure to an internal copy.
-      ctx->priv->alg_priv->cfg = *ctx->config.enc;
-      ctx->config.enc = &ctx->priv->alg_priv->cfg;
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
     }
 
-    cfg = &ctx->priv->alg_priv->cfg;
-
-    // Select the extra vp6 configuration table based on the current
-    // usage value. If the current usage value isn't found, use the
-    // values for usage case 0.
-    for (i = 0;
-         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         ++i) {}
-
-    priv->extra_cfg = extracfg_map[i].cfg;
-    priv->extra_cfg.pkt_list = &priv->pkt_list.head;
-
-    vp9_initialize_enc();
+    priv->extra_cfg = default_extra_cfg;
+    once(vp9_initialize_enc);
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
     if (res == VPX_CODEC_OK) {
-      VP9_COMP *cpi;
-      set_encoder_config(&ctx->priv->alg_priv->oxcf,
-                         &ctx->priv->alg_priv->cfg,
-                         &ctx->priv->alg_priv->extra_cfg);
-      cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-      if (cpi == NULL)
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+      priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
+      if (priv->cpi == NULL)
         res = VPX_CODEC_MEM_ERROR;
       else
-        ctx->priv->alg_priv->cpi = cpi;
+        priv->cpi->output_pkt_list = &priv->pkt_list.head;
     }
   }
 
@@ -714,35 +818,44 @@
 static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
   vp9_remove_compressor(ctx->cpi);
-  free(ctx);
+#if CONFIG_MULTITHREAD
+  pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  vpx_free(ctx->buffer_pool);
+  vpx_free(ctx);
   return VPX_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
                                     unsigned long duration,
                                     unsigned long deadline) {
-  // Use best quality mode if no deadline is given.
-  MODE new_qc = ONE_PASS_BEST;
+  MODE new_mode = BEST;
 
-  if (deadline) {
-    // Convert duration parameter from stream timebase to microseconds
-    const uint64_t duration_us = (uint64_t)duration * 1000000 *
-                               (uint64_t)ctx->cfg.g_timebase.num /
-                               (uint64_t)ctx->cfg.g_timebase.den;
+  switch (ctx->cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      if (deadline > 0) {
+        const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
 
-    // If the deadline is more that the duration this frame is to be shown,
-    // use good quality mode. Otherwise use realtime mode.
-    new_qc = (deadline > duration_us) ? ONE_PASS_GOOD : REALTIME;
+        // Convert duration parameter from stream timebase to microseconds.
+        const uint64_t duration_us = (uint64_t)duration * 1000000 *
+           (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+
+        // If the deadline is more than the duration this frame is to be shown,
+        // use good quality mode. Otherwise use realtime mode.
+        new_mode = (deadline > duration_us) ? GOOD : REALTIME;
+      } else {
+        new_mode = BEST;
+      }
+      break;
+    case VPX_RC_FIRST_PASS:
+      break;
+    case VPX_RC_LAST_PASS:
+      new_mode = deadline > 0 ? GOOD : BEST;
+      break;
   }
 
-  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
-    new_qc = TWO_PASS_FIRST;
-  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
-    new_qc = (new_qc == ONE_PASS_BEST) ? TWO_PASS_SECOND_BEST
-                                          : TWO_PASS_SECOND_GOOD;
-
-  if (ctx->oxcf.mode != new_qc) {
-    ctx->oxcf.mode = new_qc;
+  if (ctx->oxcf.mode != new_mode) {
+    ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 }
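A minimal usage sketch, not part of the patch itself: the deadline argument of vpx_codec_encode() drives the mode selection above, using the standard constants from vpx/vpx_encoder.h.

#include "vpx/vpx_encoder.h"

/* Sketch only: deadline-to-mode mapping for a one-pass encode.
 *   VPX_DL_REALTIME (1 us)   -> REALTIME (deadline <= frame duration)
 *   VPX_DL_GOOD_QUALITY      -> GOOD     (deadline >  frame duration)
 *   VPX_DL_BEST_QUALITY (0)  -> BEST     (no deadline) */
static vpx_codec_err_t encode_good_quality(vpx_codec_ctx_t *codec,
                                           const vpx_image_t *img,
                                           vpx_codec_pts_t pts,
                                           unsigned long duration) {
  return vpx_codec_encode(codec, img, pts, duration, 0, VPX_DL_GOOD_QUALITY);
}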
@@ -821,6 +934,24 @@
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  vpx_codec_frame_flags_t flags = lib_flags << 16;
+
+  if (lib_flags & FRAMEFLAGS_KEY ||
+      (cpi->use_svc &&
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+              cpi->svc.number_temporal_layers +
+              cpi->svc.temporal_layer_id].is_key_frame)
+     )
+    flags |= VPX_FRAME_IS_KEY;
+
+  if (cpi->droppable)
+    flags |= VPX_FRAME_IS_DROPPABLE;
+
+  return flags;
+}
+
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
                                       const vpx_image_t *img,
                                       vpx_codec_pts_t pts,
@@ -828,23 +959,28 @@
                                       vpx_enc_frame_flags_t flags,
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
+  VP9_COMP *const cpi = ctx->cpi;
   const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+  size_t data_sz;
 
   if (img != NULL) {
     res = validate_img(ctx, img);
     // TODO(jzern) the checks related to cpi's validity should be treated as a
     // failure condition, encoder setup is done fully in init() currently.
-    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+    if (res == VPX_CODEC_OK && cpi != NULL) {
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
-      ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
-                        get_image_bps(img) / 8 *
-                        (ctx->cpi->multi_arf_allowed ? 8 : 2);
-      if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
-
-      ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
-      if (ctx->cx_data == NULL) {
-        return VPX_CODEC_MEM_ERROR;
+      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+                (cpi->multi_arf_allowed ? 8 : 2);
+      if (data_sz < 4096)
+        data_sz = 4096;
+      if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+        ctx->cx_data_sz = data_sz;
+        free(ctx->cx_data);
+        ctx->cx_data = (unsigned char*)malloc(ctx->cx_data_sz);
+        if (ctx->cx_data == NULL) {
+          return VPX_CODEC_MEM_ERROR;
+        }
       }
     }
   }
@@ -859,7 +995,7 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  vp9_apply_encoding_flags(ctx->cpi, flags);
+  vp9_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
   if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -871,7 +1007,7 @@
   }
 
   // Initialize the encoder instance on the first frame.
-  if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
+  if (res == VPX_CODEC_OK && cpi != NULL) {
     unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -882,18 +1018,18 @@
 
     // Set up internal flags
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
-      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+      cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
 
       // Store the original flags into the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
-      if (vp9_receive_raw_frame(ctx->cpi, flags,
+      if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
         res = update_error_state(ctx, &cpi->common.error);
       }
+      ctx->next_frame_flags = 0;
     }
 
     cx_data = ctx->cx_data;
@@ -916,24 +1052,22 @@
     }
 
     while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+           -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
                                          cx_data, &dst_time_stamp,
                                          &dst_end_time_stamp, !img)) {
       if (size) {
-        VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
         vpx_codec_cx_pkt_t pkt;
 
 #if CONFIG_SPATIAL_SVC
-        if (is_spatial_svc(cpi))
-          cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size;
+        if (cpi->use_svc)
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+              cpi->svc.number_temporal_layers].layer_size += size;
 #endif
 
         // Pack invisible frames with the next visible frame
-        if (cpi->common.show_frame == 0
-#if CONFIG_SPATIAL_SVC
-            || (is_spatial_svc(cpi) &&
-                cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
-#endif
+        if (!cpi->common.show_frame ||
+            (cpi->use_svc &&
+             cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
             ) {
           if (ctx->pending_cx_data == 0)
             ctx->pending_cx_data = cx_data;
@@ -942,6 +1076,24 @@
           ctx->pending_frame_magnitude |= size;
           cx_data += size;
           cx_data_sz -= size;
+
+          if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+            pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+            pkt.data.frame.pts = ticks_to_timebase_units(timebase,
+                                                         dst_time_stamp);
+            pkt.data.frame.duration =
+               (unsigned long)ticks_to_timebase_units(timebase,
+                   dst_end_time_stamp - dst_time_stamp);
+            pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+            pkt.data.frame.buf = ctx->pending_cx_data;
+            pkt.data.frame.sz  = size;
+            ctx->pending_cx_data = NULL;
+            ctx->pending_cx_data_sz = 0;
+            ctx->pending_frame_count = 0;
+            ctx->pending_frame_magnitude = 0;
+            ctx->output_cx_pkt_cb.output_cx_pkt(
+                &pkt, ctx->output_cx_pkt_cb.user_priv);
+          }
           continue;
         }
 
@@ -951,36 +1103,15 @@
         pkt.data.frame.duration =
            (unsigned long)ticks_to_timebase_units(timebase,
                dst_end_time_stamp - dst_time_stamp);
-        pkt.data.frame.flags = lib_flags << 16;
-
-        if (lib_flags & FRAMEFLAGS_KEY
-#if CONFIG_SPATIAL_SVC
-            || (is_spatial_svc(cpi) &&
-                cpi->svc.layer_context[0].is_key_frame)
-#endif
-            )
-          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
-        if (cpi->common.show_frame == 0) {
-          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the
-          // prior PTS so that if a decoder uses pts to schedule when
-          // to do this, we start right after last frame was decoded.
-          // Invisible frames have no duration.
-          pkt.data.frame.pts =
-              ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1;
-          pkt.data.frame.duration = 0;
-        }
-
-        if (cpi->droppable)
-          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+        pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
           ctx->pending_frame_magnitude |= size;
           ctx->pending_cx_data_sz += size;
-          size += write_superframe_index(ctx);
+          // Write the superframe index only when no output callback is set.
+          if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+            size += write_superframe_index(ctx);
           pkt.data.frame.buf = ctx->pending_cx_data;
           pkt.data.frame.sz  = ctx->pending_cx_data_sz;
           ctx->pending_cx_data = NULL;
@@ -992,21 +1123,43 @@
           pkt.data.frame.sz  = size;
         }
         pkt.data.frame.partition_id = -1;
-        vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+        if (ctx->output_cx_pkt_cb.output_cx_pkt)
+          ctx->output_cx_pkt_cb.output_cx_pkt(&pkt,
+                                              ctx->output_cx_pkt_cb.user_priv);
+        else
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
         cx_data += size;
         cx_data_sz -= size;
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
 #if CONFIG_SPATIAL_SVC
-        if (is_spatial_svc(cpi)) {
-          vpx_codec_cx_pkt_t pkt = {0};
-          int i;
-          pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
-          for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-            pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
-            cpi->svc.layer_context[i].layer_size = 0;
+        if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) {
+          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
+          int sl;
+          vp9_zero(pkt_sizes);
+          vp9_zero(pkt_psnr);
+          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
+          for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+            LAYER_CONTEXT *lc =
+                &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
+            pkt_sizes.data.layer_sizes[sl] = lc->layer_size;
+            pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt;
+            lc->layer_size = 0;
           }
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
         }
 #endif
+#endif
+        if (is_one_pass_cbr_svc(cpi) &&
+            (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+          // Encoded all spatial layers; exit loop.
+          break;
+        }
       }
     }
   }
@@ -1152,6 +1305,21 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+  if (map) {
+    if (!vp9_get_active_map(ctx->cpi, map->active_map,
+                            (int)map->rows, (int)map->cols))
+      return VPX_CODEC_OK;
+    else
+      return VPX_CODEC_INVALID_PARAM;
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
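A minimal usage sketch, not part of the patch itself: reading the encoder's active map back through the new VP9E_GET_ACTIVEMAP control. The 64x64-block granularity assumed for rows/cols mirrors VP9's handling of VP8E_SET_ACTIVEMAP and is an assumption of this sketch.

#include <stdlib.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch only: fetch the map the encoder is currently using. */
static vpx_codec_err_t read_active_map(vpx_codec_ctx_t *codec,
                                       unsigned int frame_w,
                                       unsigned int frame_h) {
  vpx_active_map_t map;
  vpx_codec_err_t res;
  map.rows = (frame_h + 63) / 64;  /* assumed 64x64 block units */
  map.cols = (frame_w + 63) / 64;
  map.active_map = (unsigned char *)malloc(map.rows * map.cols);
  if (map.active_map == NULL) return VPX_CODEC_MEM_ERROR;
  res = vpx_codec_control(codec, VP9E_GET_ACTIVEMAP, &map);
  free(map.active_map);
  return res;
}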
 static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
                                            va_list args) {
   vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
@@ -1169,16 +1337,20 @@
 static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
   int data = va_arg(args, int);
   const vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+  // Both one-pass and two-pass rate control are supported now.
+  // The caller setting this must ensure the following:
+  // In a two-pass setting: either (but not both)
+  //      cfg->ss_number_layers > 1 or cfg->ts_number_layers > 1.
+  // In a one-pass setting: either or both of
+  //      cfg->ss_number_layers > 1 and cfg->ts_number_layers > 1.
 
   vp9_set_svc(ctx->cpi, data);
-  // CBR or two pass mode for SVC with both temporal and spatial layers
-  // not yet supported.
+
   if (data == 1 &&
-      (cfg->rc_end_usage == VPX_CBR ||
-       cfg->g_pass == VPX_RC_FIRST_PASS ||
+      (cfg->g_pass == VPX_RC_FIRST_PASS ||
        cfg->g_pass == VPX_RC_LAST_PASS) &&
-      cfg->ss_number_layers > 1 &&
-      cfg->ts_number_layers > 1) {
+       cfg->ss_number_layers > 1 &&
+       cfg->ts_number_layers > 1) {
     return VPX_CODEC_INVALID_PARAM;
   }
   return VPX_CODEC_OK;
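A minimal configuration sketch, not part of the patch itself, that satisfies the one-pass constraint spelled out in the comment above (both spatial and temporal layer counts may exceed one); the layer counts are illustrative.

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch only: one-pass CBR SVC with 2 spatial x 3 temporal layers. */
static vpx_codec_err_t init_one_pass_svc(vpx_codec_ctx_t *codec,
                                         vpx_codec_enc_cfg_t *cfg) {
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp9_cx(), cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  cfg->g_pass = VPX_RC_ONE_PASS;  /* one pass: ss and ts layers may both be > 1 */
  cfg->rc_end_usage = VPX_CBR;
  cfg->ss_number_layers = 2;
  cfg->ts_number_layers = 3;
  res = vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  return vpx_codec_control(codec, VP9E_SET_SVC, 1);
}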
@@ -1204,24 +1376,48 @@
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *);
+  VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
+  SVC *const svc = &cpi->svc;
+
+  data->spatial_layer_id = svc->spatial_layer_id;
+  data->temporal_layer_id = svc->temporal_layer_id;
+
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
                                                va_list args) {
   VP9_COMP *const cpi = ctx->cpi;
-  vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *);
+  vpx_svc_extra_cfg_t *const params = va_arg(args, vpx_svc_extra_cfg_t *);
+  int sl, tl;
 
-  if (params == NULL || params->spatial_layer < 0 ||
-      params->spatial_layer >= cpi->svc.number_spatial_layers)
-    return VPX_CODEC_INVALID_PARAM;
-
-  if (params->spatial_layer == 0) {
-    int i;
-    for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-      cpi->svc.layer_context[i].svc_params_received.spatial_layer = -1;
+  // Number of temporal layers and number of spatial layers have to be set
+  // properly before calling this control function.
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    for (tl = 0; tl < cpi->svc.number_temporal_layers; ++tl) {
+      const int layer =
+          LAYER_IDS_TO_IDX(sl, tl, cpi->svc.number_temporal_layers);
+      LAYER_CONTEXT *lc =
+          &cpi->svc.layer_context[layer];
+      lc->max_q = params->max_quantizers[sl];
+      lc->min_q = params->min_quantizers[sl];
+      lc->scaling_factor_num = params->scaling_factor_num[sl];
+      lc->scaling_factor_den = params->scaling_factor_den[sl];
     }
   }
 
-  cpi->svc.layer_context[params->spatial_layer].svc_params_received =
-      *params;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
+      (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
+  ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;
+  ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;
 
   return VPX_CODEC_OK;
 }
@@ -1233,6 +1429,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
   {VP8E_UPD_ENTROPY,                  ctrl_update_entropy},
@@ -1246,7 +1449,6 @@
   {VP8E_SET_ACTIVEMAP,                ctrl_set_active_map},
   {VP8E_SET_SCALEMODE,                ctrl_set_scale_mode},
   {VP8E_SET_CPUUSED,                  ctrl_set_cpuused},
-  {VP8E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
   {VP8E_SET_ENABLEAUTOALTREF,         ctrl_set_enable_auto_alt_ref},
   {VP8E_SET_SHARPNESS,                ctrl_set_sharpness},
   {VP8E_SET_STATIC_THRESHOLD,         ctrl_set_static_thresh},
@@ -1258,19 +1460,28 @@
   {VP8E_SET_TUNING,                   ctrl_set_tuning},
   {VP8E_SET_CQ_LEVEL,                 ctrl_set_cq_level},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_rc_max_intra_bitrate_pct},
+  {VP9E_SET_MAX_INTER_BITRATE_PCT,    ctrl_set_rc_max_inter_bitrate_pct},
+  {VP9E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
   {VP9E_SET_LOSSLESS,                 ctrl_set_lossless},
   {VP9E_SET_FRAME_PARALLEL_DECODING,  ctrl_set_frame_parallel_decoding_mode},
   {VP9E_SET_AQ_MODE,                  ctrl_set_aq_mode},
   {VP9E_SET_FRAME_PERIODIC_BOOST,     ctrl_set_frame_periodic_boost},
   {VP9E_SET_SVC,                      ctrl_set_svc},
   {VP9E_SET_SVC_PARAMETERS,           ctrl_set_svc_parameters},
+  {VP9E_REGISTER_CX_CALLBACK,         ctrl_register_cx_callback},
   {VP9E_SET_SVC_LAYER_ID,             ctrl_set_svc_layer_id},
   {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
+  {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
+  {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
+  {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
+  {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
   {VP8E_GET_LAST_QUANTIZER_64,        ctrl_get_quantizer64},
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
+  {VP9E_GET_SVC_LAYER_ID,             ctrl_get_svc_layer_id},
+  {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
 
   { -1, NULL},
 };
@@ -1280,11 +1491,14 @@
     0,
     {  // NOLINT
       0,                  // g_usage
-      0,                  // g_threads
+      8,                  // g_threads
       0,                  // g_profile
 
       320,                // g_width
       240,                // g_height
+      VPX_BITS_8,         // g_bit_depth
+      8,                  // g_input_bit_depth
+
       {1, 30},            // g_timebase
 
       0,                  // g_error_resilient
@@ -1295,21 +1509,19 @@
 
       0,                  // rc_dropframe_thresh
       0,                  // rc_resize_allowed
-      1,                  // rc_scaled_width
-      1,                  // rc_scaled_height
+      0,                  // rc_scaled_width
+      0,                  // rc_scaled_height
      60,                 // rc_resize_down_thresh
      30,                 // rc_resize_up_thresh
 
       VPX_VBR,            // rc_end_usage
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
       {NULL, 0},          // rc_twopass_stats_in
       {NULL, 0},          // rc_firstpass_mb_stats_in
-#endif
       256,                // rc_target_bandwidth
       0,                  // rc_min_quantizer
       63,                 // rc_max_quantizer
-      100,                // rc_undershoot_pct
-      100,                // rc_overshoot_pct
+      25,                 // rc_undershoot_pct
+      25,                 // rc_overshoot_pct
 
       6000,               // rc_max_buffer_size
       4000,               // rc_buffer_initial_size
@@ -1332,9 +1544,8 @@
       {0},                    // ts_rate_decimator
       0,                      // ts_periodicity
       {0},                    // ts_layer_id
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
-      "vp8.fpf"           // first pass filename
-#endif
+      {0},                  // layer_target_bitrate
+      0                     // temporal_layering_mode
     }
   },
 };
@@ -1345,16 +1556,19 @@
 CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   "WebM Project VP9 Encoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
   encoder_init,       // vpx_codec_init_fn_t
   encoder_destroy,    // vpx_codec_destroy_fn_t
   encoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
   {  // NOLINT
-    NOT_IMPLEMENTED,  // vpx_codec_peek_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_decode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_frame_get_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_set_fb_fn_t
+    NULL,  // vpx_codec_peek_si_fn_t
+    NULL,  // vpx_codec_get_si_fn_t
+    NULL,  // vpx_codec_decode_fn_t
+    NULL,  // vpx_codec_frame_get_fn_t
+    NULL   // vpx_codec_set_fb_fn_t
   },
   {  // NOLINT
     1,                      // 1 cfg map
@@ -1362,8 +1576,8 @@
     encoder_encode,         // vpx_codec_encode_fn_t
     encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
     encoder_set_config,     // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,        // vpx_codec_get_global_headers_fn_t
+    NULL,        // vpx_codec_get_global_headers_fn_t
     encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED         // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL         // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 4372ac9..96ede3c 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -11,17 +11,20 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "./vpx_config.h"
 #include "./vpx_version.h"
 
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_util/vpx_thread.h"
 
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_frame_buffers.h"
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
 
@@ -29,20 +32,46 @@
 
 typedef vpx_codec_stream_info_t vp9_stream_info_t;
 
+// This limit is due to the fixed number of frame buffers.
+// TODO(hkuang): Remove this limit after implementing on-demand frame buffers.
+#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+  int fb_idx;
+  vpx_image_t img;
+} cache_frame;
+
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t        base;
   vpx_codec_dec_cfg_t     cfg;
   vp9_stream_info_t       si;
-  struct VP9Decoder *pbi;
   int                     postproc_cfg_set;
   vp8_postproc_cfg_t      postproc_cfg;
   vpx_decrypt_cb          decrypt_cb;
-  void                   *decrypt_state;
+  void                    *decrypt_state;
   vpx_image_t             img;
   int                     img_avail;
   int                     flushed;
   int                     invert_tile_order;
+  int                     last_show_frame;  // Index of last output frame.
+  int                     byte_alignment;
+  int                     skip_loop_filter;
+
+  // Frame parallel related.
   int                     frame_parallel_decode;  // frame-based threading.
+  VPxWorker               *frame_workers;
+  int                     num_frame_workers;
+  int                     next_submit_worker_id;
+  int                     last_submit_worker_id;
+  int                     next_output_worker_id;
+  int                     available_threads;
+  cache_frame             frame_cache[FRAME_CACHE_SIZE];
+  int                     frame_cache_write;
+  int                     frame_cache_read;
+  int                     num_cache_frames;
+  int                     need_resync;      // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  BufferPool              *buffer_pool;
 
   // External frame buffer info to save for VP9 common.
   void *ext_priv;  // Private data associated with the external frame buffers.
@@ -58,29 +87,21 @@
   (void)data;
 
   if (!ctx->priv) {
-    vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv));
-    if (alg_priv == NULL)
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+    if (priv == NULL)
       return VPX_CODEC_MEM_ERROR;
 
-    vp9_zero(*alg_priv);
-
-    ctx->priv = (vpx_codec_priv_t *)alg_priv;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = alg_priv;
-    ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
-    ctx->priv->alg_priv->flushed = 0;
-    ctx->priv->alg_priv->frame_parallel_decode =
-        (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
-
-    // Disable frame parallel decoding for now.
-    ctx->priv->alg_priv->frame_parallel_decode = 0;
-
+    priv->si.sz = sizeof(priv->si);
+    priv->flushed = 0;
+    // Only do frame parallel decode when threads > 1.
+    priv->frame_parallel_decode =
+        (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+         (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
     if (ctx->config.dec) {
-      // Update the reference to the config structure to an internal copy.
-      ctx->priv->alg_priv->cfg = *ctx->config.dec;
-      ctx->config.dec = &ctx->priv->alg_priv->cfg;
+      priv->cfg = *ctx->config.dec;
+      ctx->config.dec = &priv->cfg;
     }
   }
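A minimal usage sketch, not part of the patch itself: the new frame-parallel path is taken only when the decoder is configured with more than one thread and the VPX_CODEC_USE_FRAME_THREADING init flag is set, as checked above. The thread count is illustrative.

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Sketch only: open a VP9 decoder with four frame workers. */
static vpx_codec_err_t open_parallel_decoder(vpx_codec_ctx_t *codec) {
  vpx_codec_dec_cfg_t cfg = { 4, 0, 0 };  /* threads, w, h */
  return vpx_codec_dec_init(codec, vpx_codec_vp9_dx(), &cfg,
                            VPX_CODEC_USE_FRAME_THREADING);
}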
 
@@ -88,24 +109,48 @@
 }
 
 static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
-  if (ctx->pbi) {
-    vp9_decoder_remove(ctx->pbi);
-    ctx->pbi = NULL;
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VPxWorker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      vpx_get_worker_interface()->end(worker);
+      vp9_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_VP9_POSTPROC
+      vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
+#endif
+      vp9_decoder_remove(frame_worker_data->pbi);
+      vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+      pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+      vpx_free(frame_worker_data);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
   }
 
-  vpx_free(ctx);
+  if (ctx->buffer_pool) {
+    vp9_free_ref_frame_buffers(ctx->buffer_pool);
+    vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
 
+  vpx_free(ctx->frame_workers);
+  vpx_free(ctx->buffer_pool);
+  vpx_free(ctx);
   return VPX_CODEC_OK;
 }
 
 static int parse_bitdepth_colorspace_sampling(
-    BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
-  const int sRGB = 7;
-  int colorspace;
+    BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) {
+  vpx_color_space_t color_space;
   if (profile >= PROFILE_2)
     rb->bit_offset += 1;  // Bit-depth 10 or 12.
-  colorspace = vp9_rb_read_literal(rb, 3);
-  if (colorspace != sRGB) {
+  color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3);
+  if (color_space != VPX_CS_SRGB) {
     rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
     if (profile == PROFILE_1 || profile == PROFILE_3) {
       rb->bit_offset += 2;  // subsampling x/y.
@@ -146,26 +191,30 @@
   {
     int show_frame;
     int error_resilient;
-    struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
-    const int frame_marker = vp9_rb_read_literal(&rb, 2);
+    struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+    const int frame_marker = vpx_rb_read_literal(&rb, 2);
     const BITSTREAM_PROFILE profile = vp9_read_profile(&rb);
 
     if (frame_marker != VP9_FRAME_MARKER)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+    if (profile >= MAX_PROFILES)
+      return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if (vp9_rb_read_bit(&rb)) {  // show an existing frame
-      vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
+    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
     if (data_sz <= 8)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    si->is_kf = !vp9_rb_read_bit(&rb);
-    show_frame = vp9_rb_read_bit(&rb);
-    error_resilient = vp9_rb_read_bit(&rb);
+    si->is_kf = !vpx_rb_read_bit(&rb);
+    show_frame = vpx_rb_read_bit(&rb);
+    error_resilient = vpx_rb_read_bit(&rb);
 
     if (si->is_kf) {
       if (!vp9_read_sync_code(&rb))
@@ -175,7 +224,7 @@
         return VPX_CODEC_UNSUP_BITSTREAM;
       vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
     } else {
-      intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb);
 
       rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
@@ -213,32 +262,46 @@
   return VPX_CODEC_OK;
 }
 
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;
+}
+
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
                            const struct vpx_internal_error_info *error) {
   if (error->error_code)
-    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
 
   return error->error_code;
 }
 
 static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
-  VP9_COMMON *const cm = &ctx->pbi->common;
+  int i;
 
-  cm->new_fb_idx = -1;
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VPxWorker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    BufferPool *const pool = cm->buffer_pool;
 
-  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-    cm->get_fb_cb = ctx->get_ext_fb_cb;
-    cm->release_fb_cb = ctx->release_ext_fb_cb;
-    cm->cb_priv = ctx->ext_priv;
-  } else {
-    cm->get_fb_cb = vp9_get_frame_buffer;
-    cm->release_fb_cb = vp9_release_frame_buffer;
+    cm->new_fb_idx = INVALID_IDX;
+    cm->byte_alignment = ctx->byte_alignment;
+    cm->skip_loop_filter = ctx->skip_loop_filter;
 
-    if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to initialize internal frame buffers");
+    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+      pool->get_fb_cb = ctx->get_ext_fb_cb;
+      pool->release_fb_cb = ctx->release_ext_fb_cb;
+      pool->cb_priv = ctx->ext_priv;
+    } else {
+      pool->get_fb_cb = vp9_get_frame_buffer;
+      pool->release_fb_cb = vp9_release_frame_buffer;
 
-    cm->cb_priv = &cm->int_frame_buffers;
+      if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");
+
+      pool->cb_priv = &pool->int_frame_buffers;
+    }
   }
 }
 
@@ -257,14 +320,127 @@
   flags->noise_level = ctx->postproc_cfg.noise_level;
 }
 
-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
-  ctx->pbi = vp9_decoder_create();
-  if (ctx->pbi == NULL)
-    return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = frame_worker_data->data;
+  (void)arg2;
 
-  ctx->pbi->max_threads = ctx->cfg.threads;
-  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
-  ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+  frame_worker_data->result =
+      vp9_receive_compressed_data(frame_worker_data->pbi,
+                                  frame_worker_data->data_size,
+                                  &data);
+  frame_worker_data->data_end = data;
+
+  if (frame_worker_data->pbi->frame_parallel_decode) {
+    // In frame parallel decoding, a worker thread must successfully decode all
+    // the compressed data.
+    if (frame_worker_data->result != 0 ||
+        frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+      VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
+      BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+      // Signal all the other threads that are waiting for this frame.
+      vp9_frameworker_lock_stats(worker);
+      frame_worker_data->frame_context_ready = 1;
+      lock_buffer_pool(pool);
+      frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+      unlock_buffer_pool(pool);
+      frame_worker_data->pbi->need_resync = 1;
+      vp9_frameworker_signal_stats(worker);
+      vp9_frameworker_unlock_stats(worker);
+      return 0;
+    }
+  } else if (frame_worker_data->result != 0) {
+    // Check decode result in serial decode.
+    frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+    frame_worker_data->pbi->need_resync = 1;
+  }
+  return !frame_worker_data->result;
+}
+
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+
+  ctx->last_show_frame = -1;
+  ctx->next_submit_worker_id = 0;
+  ctx->last_submit_worker_id = 0;
+  ctx->next_output_worker_id = 0;
+  ctx->frame_cache_read = 0;
+  ctx->frame_cache_write = 0;
+  ctx->num_cache_frames = 0;
+  ctx->need_resync = 1;
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+    ctx->num_frame_workers = MAX_DECODE_THREADS;
+  ctx->available_threads = ctx->num_frame_workers;
+  ctx->flushed = 0;
+
+  ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+  if (ctx->buffer_pool == NULL)
+    return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+
+  ctx->frame_workers = (VPxWorker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VPxWorker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *frame_worker_data = NULL;
+    winterface->init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+    if (frame_worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data->pbi->frame_worker_owner = worker;
+    frame_worker_data->worker_id = i;
+    frame_worker_data->scratch_buffer = NULL;
+    frame_worker_data->scratch_buffer_size = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+    // When decoding in serial mode, the FrameWorker thread may create tile
+    // worker threads or a loopfilter thread.
+    frame_worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    frame_worker_data->pbi->common.frame_parallel_decode =
+        ctx->frame_parallel_decode;
+    worker->hook = (VPxWorkerHook)frame_worker_hook;
+    if (!winterface->reset(worker)) {
+      set_error_detail(ctx, "Frame Worker thread creation failed");
+      return VPX_CODEC_MEM_ERROR;
+    }
+  }
 
   // If postprocessing was enabled by the application and a
   // configuration has not been provided, default it.
@@ -273,20 +449,24 @@
     set_default_ppflags(&ctx->postproc_cfg);
 
   init_buffer_callbacks(ctx);
+
+  return VPX_CODEC_OK;
+}
+
+static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
+                                const VP9Decoder *const pbi) {
+  // Clear resync flag if worker got a key frame or intra only frame.
+  if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
+      (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
+    ctx->need_resync = 0;
 }
 
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
                                   void *user_priv, int64_t deadline) {
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = {0, 0, 0};
-  VP9_COMMON *cm = NULL;
-
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   (void)deadline;
 
-  vp9_zero(sd);
-  ctx->img_avail = 0;
-
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -302,109 +482,102 @@
       return VPX_CODEC_ERROR;
   }
 
-  // Initialize the decoder instance on the first frame
-  if (ctx->pbi == NULL) {
-    init_decoder(ctx);
-    if (ctx->pbi == NULL)
-      return VPX_CODEC_ERROR;
+  if (!ctx->frame_parallel_decode) {
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->data = *data;
+    frame_worker_data->data_size = data_sz;
+    frame_worker_data->user_priv = user_priv;
+    frame_worker_data->received_frame = 1;
+
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+
+    worker->had_error = 0;
+    winterface->execute(worker);
+
+    // Update data pointer after decode.
+    *data = frame_worker_data->data_end;
+
+    if (worker->had_error)
+      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+
+    check_resync(ctx, frame_worker_data->pbi);
+  } else {
+    VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    // Copy context from last worker thread to next worker thread.
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      vp9_frameworker_copy_context(
+          &ctx->frame_workers[ctx->next_submit_worker_id],
+          &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+    frame_worker_data->pbi->ready_for_new_data = 0;
+    // Copy the compressed data into worker's internal buffer.
+    // TODO(hkuang): Would it be better for all workers to allocate the same
+    // size as the first intra frame? That would avoid repeated allocation
+    // and deallocation.
+    if (frame_worker_data->scratch_buffer_size < data_sz) {
+      frame_worker_data->scratch_buffer =
+          (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+      if (frame_worker_data->scratch_buffer == NULL) {
+        set_error_detail(ctx, "Failed to reallocate scratch buffer");
+        return VPX_CODEC_MEM_ERROR;
+      }
+      frame_worker_data->scratch_buffer_size = data_sz;
+    }
+    frame_worker_data->data_size = data_sz;
+    memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+    frame_worker_data->frame_decoded = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 1;
+    frame_worker_data->data = frame_worker_data->scratch_buffer;
+    frame_worker_data->user_priv = user_priv;
+
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      ctx->last_submit_worker_id =
+          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+    ctx->next_submit_worker_id =
+        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+    --ctx->available_threads;
+    worker->had_error = 0;
+    winterface->launch(worker);
   }
 
-  // Set these even if already initialized.  The caller may have changed the
-  // decrypt config between frames.
-  ctx->pbi->decrypt_cb = ctx->decrypt_cb;
-  ctx->pbi->decrypt_state = ctx->decrypt_state;
-
-  cm = &ctx->pbi->common;
-
-  if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
-    return update_error_state(ctx, &cm->error);
-
-  if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
-    set_ppflags(ctx, &flags);
-
-  if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
-    return update_error_state(ctx, &cm->error);
-
-  yuvconfig2image(&ctx->img, &sd, user_priv);
-  ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-  ctx->img_avail = 1;
-
   return VPX_CODEC_OK;
 }
 
-static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
-                                  void *decrypt_state,
-                                  const uint8_t *data) {
-  if (decrypt_cb) {
-    uint8_t marker;
-    decrypt_cb(decrypt_state, data, &marker, 1);
-    return marker;
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+  vp9_ppflags_t flags = {0, 0, 0};
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  ctx->next_output_worker_id =
+      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+  // TODO(hkuang): Add worker error handling here.
+  winterface->sync(worker);
+  frame_worker_data->received_frame = 0;
+  ++ctx->available_threads;
+
+  check_resync(ctx, frame_worker_data->pbi);
+
+  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+                    frame_worker_data->user_priv);
+    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+    ctx->frame_cache_write =
+        (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+    ++ctx->num_cache_frames;
   }
-  return *data;
-}
-
-static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
-                                              size_t data_sz,
-                                              uint32_t sizes[8], int *count,
-                                              vpx_decrypt_cb decrypt_cb,
-                                              void *decrypt_state) {
-  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
-  // it is a super frame index. If the last byte of real video compression
-  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
-  // not the associated matching marker byte at the front of the index we have
-  // an invalid bitstream and need to return an error.
-
-  uint8_t marker;
-
-  assert(data_sz);
-  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
-  *count = 0;
-
-  if ((marker & 0xe0) == 0xc0) {
-    const uint32_t frames = (marker & 0x7) + 1;
-    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * frames;
-
-    // This chunk is marked as having a superframe index but doesn't have
-    // enough data for it, thus it's an invalid superframe index.
-    if (data_sz < index_sz)
-      return VPX_CODEC_CORRUPT_FRAME;
-
-    {
-      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
-                                          data + data_sz - index_sz);
-
-      // This chunk is marked as having a superframe index but doesn't have
-      // the matching marker byte at the front of the index therefore it's an
-      // invalid chunk.
-      if (marker != marker2)
-        return VPX_CODEC_CORRUPT_FRAME;
-    }
-
-    {
-      // Found a valid superframe index.
-      uint32_t i, j;
-      const uint8_t *x = &data[data_sz - index_sz + 1];
-
-      // Frames has a maximum of 8 and mag has a maximum of 4.
-      uint8_t clear_buffer[32];
-      assert(sizeof(clear_buffer) >= frames * mag);
-      if (decrypt_cb) {
-        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
-        x = clear_buffer;
-      }
-
-      for (i = 0; i < frames; ++i) {
-        uint32_t this_sz = 0;
-
-        for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
-        sizes[i] = this_sz;
-      }
-      *count = frames;
-    }
-  }
-  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
@@ -424,8 +597,15 @@
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
-  res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                               ctx->decrypt_cb, ctx->decrypt_state);
+  // Initialize the decoder workers on the first frame.
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
+
+  res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                                   ctx->decrypt_cb, ctx->decrypt_state);
   if (res != VPX_CODEC_OK)
     return res;
 
@@ -440,30 +620,46 @@
       for (i = 0; i < frame_count; ++i) {
         const uint8_t *data_start_copy = data_start;
         const uint32_t frame_size = frame_sizes[i];
-        vpx_codec_err_t res;
         if (data_start < data
             || frame_size > (uint32_t) (data_end - data_start)) {
-          ctx->base.err_detail = "Invalid frame size in index";
+          set_error_detail(ctx, "Invalid frame size in index");
           return VPX_CODEC_CORRUPT_FRAME;
         }
 
+        if (ctx->available_threads == 0) {
+          // No more threads for decoding. Wait until the next output worker
+          // finishes decoding. Then copy the decoded frame into cache.
+          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+            wait_worker_and_cache_frame(ctx);
+          } else {
+            // TODO(hkuang): Add unit test to test this path.
+            set_error_detail(ctx, "Frame output cache is full.");
+            return VPX_CODEC_ERROR;
+          }
+        }
+
         res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
                          deadline);
         if (res != VPX_CODEC_OK)
           return res;
-
         data_start += frame_size;
       }
     } else {
-      res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
+      if (ctx->available_threads == 0) {
+        // No more threads for decoding. Wait until the next output worker
+        // finishes decoding. Then copy the decoded frame into cache.
+        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+          wait_worker_and_cache_frame(ctx);
+        } else {
+          // TODO(hkuang): Add unit test to test this path.
+          set_error_detail(ctx, "Frame output cache is full.");
+          return VPX_CODEC_ERROR;
+        }
+      }
+
+      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
       if (res != VPX_CODEC_OK)
         return res;
-
-      // Extra data detected after the frame.
-      if (data_start < data_end - 1) {
-        ctx->base.err_detail = "Fail to decode frame in parallel mode";
-        return VPX_CODEC_INCAPABLE;
-      }
     }
   } else {
     // Decode in serial mode.
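
Both frame-parallel branches above repeat the same back-pressure rule: when no worker thread is free, drain one finished worker into the frame cache, and fail only once the cache is also full. A hedged sketch of how that duplicated block could be factored into one helper, using only the fields and functions that appear in this diff:

  // Sketch only: FRAME_CACHE_SIZE, wait_worker_and_cache_frame() and
  // set_error_detail() are the names used in the hunk above.
  static vpx_codec_err_t ensure_worker_available(vpx_codec_alg_priv_t *ctx) {
    if (ctx->available_threads > 0)
      return VPX_CODEC_OK;                       // a worker can take the frame
    if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
      wait_worker_and_cache_frame(ctx);          // blocks on one output worker
      return VPX_CODEC_OK;
    }
    set_error_detail(ctx, "Frame output cache is full.");
    return VPX_CODEC_ERROR;                      // caller must drain get_frame()
  }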
@@ -476,7 +672,7 @@
         vpx_codec_err_t res;
         if (data_start < data
             || frame_size > (uint32_t) (data_end - data_start)) {
-          ctx->base.err_detail = "Invalid frame size in index";
+          set_error_detail(ctx, "Invalid frame size in index");
           return VPX_CODEC_CORRUPT_FRAME;
         }
 
@@ -507,24 +703,89 @@
     }
   }
 
-  return VPX_CODEC_OK;
+  return res;
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+  // Decrease reference count of last output frame in frame parallel mode.
+  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+    BufferPool *const pool = ctx->buffer_pool;
+    lock_buffer_pool(pool);
+    decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+  }
 }
 
 static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
-  if (ctx->img_avail) {
-    // iter acts as a flip flop, so an image is only returned on the first
-    // call to get_frame.
-    if (!(*iter)) {
-      img = &ctx->img;
-      *iter = img;
-    }
+  // Only return a frame when all the CPUs are busy or the
+  // application has flushed the decoder in frame parallel decode.
+  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+      !ctx->flushed) {
+    return NULL;
   }
-  ctx->img_avail = 0;
 
-  return img;
+  // Output the frames in the cache first.
+  if (ctx->num_cache_frames > 0) {
+    release_last_output_frame(ctx);
+    ctx->last_show_frame  = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+    if (ctx->need_resync)
+      return NULL;
+    img = &ctx->frame_cache[ctx->frame_cache_read].img;
+    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+    --ctx->num_cache_frames;
+    return img;
+  }
+
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame.
+  if (*iter == NULL && ctx->frame_workers != NULL) {
+    do {
+      YV12_BUFFER_CONFIG sd;
+      vp9_ppflags_t flags = {0, 0, 0};
+      const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+      VPxWorker *const worker =
+          &ctx->frame_workers[ctx->next_output_worker_id];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      ctx->next_output_worker_id =
+          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+        set_ppflags(ctx, &flags);
+      // Wait for the frame from worker thread.
+      if (winterface->sync(worker)) {
+        // Check if worker has received any frames.
+        if (frame_worker_data->received_frame == 1) {
+          ++ctx->available_threads;
+          frame_worker_data->received_frame = 0;
+          check_resync(ctx, frame_worker_data->pbi);
+        }
+        if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+          VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+          release_last_output_frame(ctx);
+          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+          if (ctx->need_resync)
+            return NULL;
+          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+          ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+          img = &ctx->img;
+          return img;
+        }
+      } else {
+        // Decoding failed. Release the worker thread.
+        frame_worker_data->received_frame = 0;
+        ++ctx->available_threads;
+        ctx->need_resync = 1;
+        if (ctx->flushed != 1)
+          return NULL;
+      }
+    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+  }
+  return NULL;
 }
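
decoder_get_frame() above drains the frame cache as a small ring buffer: frame_cache_read advances modulo FRAME_CACHE_SIZE while num_cache_frames tracks occupancy. A self-contained sketch of that ring-buffer discipline; the struct, the push side, and the cache size of 4 are illustrative assumptions, and only the pop side mirrors the code in this hunk:

  #define CACHE_SIZE 4              // illustrative; the real FRAME_CACHE_SIZE
                                    // is defined elsewhere in this file
  typedef struct {
    int fb_idx[CACHE_SIZE];         // stand-in for the cached frame entries
    int read, write, count;
  } FrameRing;

  static int ring_pop(FrameRing *r, int *fb_idx) {
    if (r->count == 0) return 0;                 // nothing cached yet
    *fb_idx = r->fb_idx[r->read];
    r->read = (r->read + 1) % CACHE_SIZE;        // same modulo step as the diff
    --r->count;
    return 1;
  }

  static int ring_push(FrameRing *r, int fb_idx) {
    if (r->count == CACHE_SIZE) return 0;        // full: decode must wait
    r->fb_idx[r->write] = fb_idx;
    r->write = (r->write + 1) % CACHE_SIZE;
    ++r->count;
    return 1;
  }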
 
 static vpx_codec_err_t decoder_set_fb_fn(
@@ -533,7 +794,7 @@
     vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
   if (cb_get == NULL || cb_release == NULL) {
     return VPX_CODEC_INVALID_PARAM;
-  } else if (ctx->pbi == NULL) {
+  } else if (ctx->frame_workers == NULL) {
     // If the decoder has already been initialized, do not accept changes to
     // the frame buffer functions.
     ctx->get_ext_fb_cb = cb_get;
@@ -549,12 +810,19 @@
                                           va_list args) {
   vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
     vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
-
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&ctx->pbi->common,
+    return vp9_set_reference_dec(&frame_worker_data->pbi->common,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -565,13 +833,19 @@
                                            va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
     YV12_BUFFER_CONFIG sd;
-
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-
-    return vp9_copy_reference_dec(ctx->pbi,
+    return vp9_copy_reference_dec(frame_worker_data->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -582,10 +856,18 @@
                                           va_list args) {
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
-  if (data) {
-    YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx);
-    if (fb == NULL) return VPX_CODEC_ERROR;
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
 
+  if (data) {
+    YV12_BUFFER_CONFIG* fb;
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+    if (fb == NULL) return VPX_CODEC_ERROR;
     yuvconfig2image(&data->img, fb, NULL);
     return VPX_CODEC_OK;
   } else {
@@ -623,48 +905,123 @@
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
-  if (update_info) {
-    if (ctx->pbi)
-      *update_info = ctx->pbi->refresh_frame_flags;
-    else
-      return VPX_CODEC_ERROR;
-    return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
   }
-}
 
+  if (update_info) {
+    if (ctx->frame_workers) {
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      *update_info = frame_worker_data->pbi->refresh_frame_flags;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
 
 static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
-  if (corrupted != NULL && ctx->pbi != NULL) {
-    const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show;
-    if (frame == NULL) return VPX_CODEC_ERROR;
-    *corrupted = frame->corrupted;
-    return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
+  if (corrupted) {
+    if (ctx->frame_workers) {
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      RefCntBuffer *const frame_bufs =
+          frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+      if (frame_worker_data->pbi->common.frame_to_show == NULL)
+        return VPX_CODEC_ERROR;
+      if (ctx->last_show_frame >= 0)
+        *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
   }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const frame_size = va_arg(args, int *);
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (frame_size) {
+    if (ctx->frame_workers) {
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+      frame_size[0] = cm->width;
+      frame_size[1] = cm->height;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
                                              va_list args) {
   int *const display_size = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (display_size) {
-    if (ctx->pbi) {
-      const VP9_COMMON *const cm = &ctx->pbi->common;
+    if (ctx->frame_workers) {
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
       display_size[0] = cm->display_width;
       display_size[1] = cm->display_height;
+      return VPX_CODEC_OK;
     } else {
       return VPX_CODEC_ERROR;
     }
-    return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const bit_depth = va_arg(args, unsigned int *);
+  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+  if (bit_depth) {
+    if (worker) {
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+      *bit_depth = cm->bit_depth;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -681,6 +1038,42 @@
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  const int legacy_byte_alignment = 0;
+  const int min_byte_alignment = 32;
+  const int max_byte_alignment = 1024;
+  const int byte_alignment = va_arg(args, int);
+
+  if (byte_alignment != legacy_byte_alignment &&
+      (byte_alignment < min_byte_alignment ||
+       byte_alignment > max_byte_alignment ||
+       (byte_alignment & (byte_alignment - 1)) != 0))
+    return VPX_CODEC_INVALID_PARAM;
+
+  ctx->byte_alignment = byte_alignment;
+  if (ctx->frame_workers) {
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data =
+        (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+  }
+  return VPX_CODEC_OK;
+}
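
ctrl_set_byte_alignment() above accepts either 0 (the legacy default) or a power of two between 32 and 1024, using the standard (x & (x - 1)) == 0 test for powers of two. A tiny standalone sketch of the same rule, with the bounds copied from the diff:

  // Mirrors the validation in ctrl_set_byte_alignment(); 0 keeps legacy
  // behaviour, anything else must be a power of two in [32, 1024].
  static int byte_alignment_is_valid(int a) {
    if (a == 0) return 1;
    return a >= 32 && a <= 1024 && (a & (a - 1)) == 0;
  }
  // Accepted: 0, 32, 64, 512, 1024.  Rejected: 48, 100, 2048.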
+
+static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  ctx->skip_loop_filter = va_arg(args, int);
+
+  if (ctx->frame_workers) {
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+  }
+
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,            ctrl_copy_reference},
 
@@ -693,12 +1086,16 @@
   {VP8_SET_DBG_DISPLAY_MV,        ctrl_set_dbg_options},
   {VP9_INVERT_TILE_DECODE_ORDER,  ctrl_set_invert_tile_order},
   {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
+  {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
+  {VP9_SET_SKIP_LOOP_FILTER,      ctrl_set_skip_loop_filter},
 
   // Getters
   {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
   {VP8D_GET_FRAME_CORRUPTED,      ctrl_get_frame_corrupted},
   {VP9_GET_REFERENCE,             ctrl_get_reference},
   {VP9D_GET_DISPLAY_SIZE,         ctrl_get_display_size},
+  {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
+  {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
 
   { -1, NULL},
 };
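
The map above is the data-driven dispatch table that vpx_codec_control() walks, so the two new getters become reachable from applications without any new exported symbols. A hedged application-side sketch of calling them; the decoder context name is an assumption and error checking is omitted:

  unsigned int bit_depth = 0;
  int frame_size[2] = {0, 0};    // {width, height}

  // 'decoder' is assumed to be an initialized vpx_codec_ctx_t.
  vpx_codec_control(&decoder, VP9D_GET_BIT_DEPTH, &bit_depth);
  vpx_codec_control(&decoder, VP9D_GET_FRAME_SIZE, frame_size);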
@@ -723,12 +1120,12 @@
   },
   { // NOLINT
     0,
-    NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
-    NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_global_headers_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL,  // vpx_codec_enc_cfg_map_t
+    NULL,  // vpx_codec_encode_fn_t
+    NULL,  // vpx_codec_get_cx_data_fn_t
+    NULL,  // vpx_codec_enc_config_set_fn_t
+    NULL,  // vpx_codec_get_global_headers_fn_t
+    NULL,  // vpx_codec_get_preview_frame_fn_t
+    NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };
diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h
index fc98b62..58bb7d5 100644
--- a/libvpx/vp9/vp9_iface_common.h
+++ b/libvpx/vp9/vp9_iface_common.h
@@ -10,17 +10,17 @@
 #ifndef VP9_VP9_IFACE_COMMON_H_
 #define VP9_VP9_IFACE_COMMON_H_
 
+#include "vpx_ports/mem.h"
+
 static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
                             void *user_priv) {
   /** vpx_img_wrap() doesn't allow specifying independent strides for
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  const int ss_x = yv12->uv_crop_width < yv12->y_crop_width;
-  const int ss_y = yv12->uv_crop_height < yv12->y_crop_height;
   int bps;
-  if (!ss_y) {
-    if (!ss_x) {
+  if (!yv12->subsampling_y) {
+    if (!yv12->subsampling_x) {
       img->fmt = VPX_IMG_FMT_I444;
       bps = 24;
     } else {
@@ -28,16 +28,22 @@
       bps = 16;
     }
   } else {
-    img->fmt = VPX_IMG_FMT_I420;
-    bps = 12;
+    if (!yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I440;
+      bps = 16;
+    } else {
+      img->fmt = VPX_IMG_FMT_I420;
+      bps = 12;
+    }
   }
+  img->cs = yv12->color_space;
   img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
-  img->x_chroma_shift = ss_x;
-  img->y_chroma_shift = ss_y;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
@@ -46,6 +52,22 @@
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
   img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // vpx_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    img->bit_depth = yv12->bit_depth;
+    img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -68,11 +90,40 @@
                                             : yv12->y_width;
   yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
                                              : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
 
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->color_space = img->cs;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    // In vpx_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border counts in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+  yv12->border  = (yv12->y_stride - img->w) / 2;
+#else
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
   return VPX_CODEC_OK;
 }
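
The two CONFIG_VP9_HIGHBITDEPTH blocks above encode one convention: vpx_image_t always counts strides in bytes and points at the first byte, while a high-bit-depth YV12_BUFFER_CONFIG counts strides in uint16 samples, so crossing the boundary doubles or halves the stride and converts the plane pointers with CONVERT_TO_SHORTPTR()/CONVERT_TO_BYTEPTR(). A small sketch of just the stride arithmetic, with illustrative numbers that are not taken from the patch:

  // Assume a 10-bit frame with 1920 luma samples per row.
  int yv12_y_stride = 1920;               // counted in uint16 samples
  int img_y_stride  = 2 * yv12_y_stride;  // vpx_image_t counts bytes: 3840

  // Going back the other way (image2yuvconfig) halves it again:
  int samples_per_row = img_y_stride >> 1;  // 1920 uint16 samples per row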
 
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index dc46c4e..84b12d7 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -17,6 +17,7 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
+VP9_CX_SRCS-yes += encoder/vp9_avg.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -29,13 +30,11 @@
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_ethread.h
+VP9_CX_SRCS-yes += encoder/vp9_ethread.c
 VP9_CX_SRCS-yes += encoder/vp9_extend.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
@@ -53,7 +52,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
-VP9_CX_SRCS-yes += encoder/vp9_variance.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
 VP9_CX_SRCS-yes += encoder/vp9_encoder.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@@ -63,7 +61,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_rd.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
 VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
-VP9_CX_SRCS-yes += encoder/vp9_sad.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
 VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
@@ -73,17 +70,18 @@
 VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.h
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
-VP9_CX_SRCS-yes += encoder/vp9_variance.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
@@ -93,46 +91,47 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 endif
 
 ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
+endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
 
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+endif
+
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index 1fcb36f..0e9cf16 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -21,14 +21,10 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
-VP9_DX_SRCS-yes += decoder/vp9_reader.h
-VP9_DX_SRCS-yes += decoder/vp9_reader.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
+VP9_DX_SRCS-yes += decoder/vp9_dthread.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_decoder.c
 VP9_DX_SRCS-yes += decoder/vp9_decoder.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
diff --git a/libvpx/vpx/exports_dec b/libvpx/vpx/exports_dec
index 3ce1499..c694eba 100644
--- a/libvpx/vpx/exports_dec
+++ b/libvpx/vpx/exports_dec
@@ -1,10 +1,8 @@
 text vpx_codec_dec_init_ver
 text vpx_codec_decode
 text vpx_codec_get_frame
-text vpx_codec_get_mem_map
 text vpx_codec_get_stream_info
 text vpx_codec_peek_stream_info
 text vpx_codec_register_put_frame_cb
 text vpx_codec_register_put_slice_cb
 text vpx_codec_set_frame_buffer_functions
-text vpx_codec_set_mem_map
diff --git a/libvpx/vpx/exports_enc b/libvpx/vpx/exports_enc
index 07f0280..e4707ba 100644
--- a/libvpx/vpx/exports_enc
+++ b/libvpx/vpx/exports_enc
@@ -1,5 +1,6 @@
 text vpx_codec_enc_config_default
 text vpx_codec_enc_config_set
+text vpx_codec_enc_init_multi_ver
 text vpx_codec_enc_init_ver
 text vpx_codec_encode
 text vpx_codec_get_cx_data
@@ -8,17 +9,7 @@
 text vpx_codec_set_cx_data_buf
 text vpx_svc_dump_statistics
 text vpx_svc_encode
-text vpx_svc_get_buffer
-text vpx_svc_get_encode_frame_count
-text vpx_svc_get_frame_size
 text vpx_svc_get_message
 text vpx_svc_init
-text vpx_svc_is_keyframe
 text vpx_svc_release
-text vpx_svc_set_keyframe
 text vpx_svc_set_options
-text vpx_svc_set_quantizers
-text vpx_svc_set_scale_factors
-text vpx_svc_get_layer_resolution
-text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
diff --git a/libvpx/vpx/internal/vpx_codec_internal.h b/libvpx/vpx/internal/vpx_codec_internal.h
index a7716d1..7380fcc 100644
--- a/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/libvpx/vpx/internal/vpx_codec_internal.h
@@ -286,8 +286,6 @@
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-#define NOT_IMPLEMENTED 0
-
 /*!\brief Decoder algorithm interface interface
  *
  * All decoders \ref MUST expose a variable of this type.
@@ -337,9 +335,6 @@
  * and the pointer cast to the proper type.
  */
 struct vpx_codec_priv {
-  unsigned int                    sz;
-  vpx_codec_iface_t              *iface;
-  struct vpx_codec_alg_priv      *alg_priv;
   const char                     *err_detail;
   vpx_codec_flags_t               init_flags;
   struct {
@@ -347,8 +342,7 @@
     vpx_codec_priv_cb_pair_t    put_slice_cb;
   } dec;
   struct {
-    int                         tbd;
-    struct vpx_fixed_buf        cx_data_dst_buf;
+    vpx_fixed_buf_t             cx_data_dst_buf;
     unsigned int                cx_data_pad_before;
     unsigned int                cx_data_pad_after;
     vpx_codec_cx_pkt_t          cx_data_pkt;
@@ -431,10 +425,18 @@
   jmp_buf          jmp;
 };
 
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
 void vpx_internal_error(struct vpx_internal_error_info *info,
                         vpx_codec_err_t                 error,
                         const char                     *fmt,
-                        ...);
+                        ...) CLANG_ANALYZER_NORETURN;
 
 #ifdef __cplusplus
 }  // extern "C"
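
The CLANG_ANALYZER_NORETURN macro added above lets the clang static analyzer treat the error path of vpx_internal_error() as non-returning without affecting other compilers, because __has_feature() is only consulted when the compiler defines it. A hedged sketch of the same probe applied to a hypothetical helper of your own:

  #define MY_ANALYZER_NORETURN
  #if defined(__has_feature)
  #if __has_feature(attribute_analyzer_noreturn)
  #undef MY_ANALYZER_NORETURN
  #define MY_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
  #endif
  #endif

  // The analyzer now assumes control never returns from this call site.
  void my_fatal_error(const char *fmt, ...) MY_ANALYZER_NORETURN;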
diff --git a/libvpx/vpx/src/svc_encodeframe.c b/libvpx/vpx/src/svc_encodeframe.c
index 7828615..9844ace 100644
--- a/libvpx/vpx/src/svc_encodeframe.c
+++ b/libvpx/vpx/src/svc_encodeframe.c
@@ -15,12 +15,12 @@
 
 #include <assert.h>
 #include <math.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
@@ -44,11 +44,32 @@
 #define SVC_REFERENCE_FRAMES 8
 #define SUPERFRAME_SLOTS (8)
 #define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
-#define OPTION_BUFFER_SIZE 256
-#define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
 
-static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
-static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
+#define MAX_QUANTIZER 63
+
+static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = {
+  4, 5, 7, 11, 16
+};
+
+static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = {
+  16, 16, 16, 16, 16
+};
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const int option_max_values[ALL_OPTION_TYPES] = {
+  63, INT_MAX, INT_MAX, 1
+};
+
+static const int option_min_values[ALL_OPTION_TYPES] = {
+  0, 0, 1, 0
+};
 
 // One encoded frame
 typedef struct FrameData {
@@ -58,113 +79,26 @@
   struct FrameData         *next;
 } FrameData;
 
-typedef struct SvcInternal {
-  char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
-  char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
-  char scale_factors[OPTION_BUFFER_SIZE];  // set by vpx_svc_set_scale_factors
-
-  // values extracted from option, quantizers
-  int scaling_factor_num[VPX_SS_MAX_LAYERS];
-  int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int quantizer[VPX_SS_MAX_LAYERS];
-  int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
-
-  // accumulated statistics
-  double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
-  uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
-  uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
-
-  // codec encoding values
-  int width;    // width of highest layer
-  int height;   // height of highest layer
-  int kf_dist;  // distance between keyframes
-
-  // state variables
-  int encode_frame_count;
-  int frame_received;
-  int frame_within_gop;
-  int layers;
-  int layer;
-  int is_keyframe;
-
-  FrameData *frame_list;
-  FrameData *frame_temp;
-
-  char *rc_stats_buf;
-  size_t rc_stats_buf_size;
-  size_t rc_stats_buf_used;
-
-  char message_buffer[2048];
-  vpx_codec_ctx_t *codec_ctx;
-} SvcInternal;
-
-// create FrameData from encoder output
-static struct FrameData *fd_create(void *buf, size_t size,
-                                   vpx_codec_frame_flags_t flags) {
-  struct FrameData *const frame_data =
-      (struct FrameData *)vpx_malloc(sizeof(*frame_data));
-  if (frame_data == NULL) {
-    return NULL;
-  }
-  frame_data->buf = vpx_malloc(size);
-  if (frame_data->buf == NULL) {
-    vpx_free(frame_data);
-    return NULL;
-  }
-  vpx_memcpy(frame_data->buf, buf, size);
-  frame_data->size = size;
-  frame_data->flags = flags;
-  return frame_data;
-}
-
-// free FrameData
-static void fd_free(struct FrameData *p) {
-  if (p) {
-    if (p->buf)
-      vpx_free(p->buf);
-    vpx_free(p);
-  }
-}
-
-// add FrameData to list
-static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
-  struct FrameData **p = list;
-
-  while (*p != NULL) p = &(*p)->next;
-  *p = layer_data;
-  layer_data->next = NULL;
-}
-
-// free FrameData list
-static void fd_free_list(struct FrameData *list) {
-  struct FrameData *p = list;
-
-  while (p) {
-    list = list->next;
-    fd_free(p);
-    p = list;
-  }
-}
-
-static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
+static SvcInternal_t *get_svc_internal(SvcContext *svc_ctx) {
   if (svc_ctx == NULL) return NULL;
   if (svc_ctx->internal == NULL) {
-    SvcInternal *const si = (SvcInternal *)malloc(sizeof(*si));
+    SvcInternal_t *const si = (SvcInternal_t *)malloc(sizeof(*si));
     if (si != NULL) {
       memset(si, 0, sizeof(*si));
     }
     svc_ctx->internal = si;
   }
-  return (SvcInternal *)svc_ctx->internal;
+  return (SvcInternal_t *)svc_ctx->internal;
 }
 
-static const SvcInternal *get_const_svc_internal(const SvcContext *svc_ctx) {
+static const SvcInternal_t *get_const_svc_internal(
+    const SvcContext *svc_ctx) {
   if (svc_ctx == NULL) return NULL;
-  return (const SvcInternal *)svc_ctx->internal;
+  return (const SvcInternal_t *)svc_ctx->internal;
 }
 
 static void svc_log_reset(SvcContext *svc_ctx) {
-  SvcInternal *const si = (SvcInternal *)svc_ctx->internal;
+  SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal;
   si->message_buffer[0] = '\0';
 }
 
@@ -173,7 +107,7 @@
   char buf[512];
   int retval = 0;
   va_list ap;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
 
   if (level > svc_ctx->log_level) {
     return retval;
@@ -196,158 +130,63 @@
   return retval;
 }
 
-static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0;
-  int i, q;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type,
+                                      char *input,
+                                      int *value0,
+                                      int *value1) {
+  if (type == SCALE_FACTOR) {
+    *value0 = strtol(input, &input, 10);
+    if (*input++ != '/')
+      return VPX_CODEC_INVALID_PARAM;
+    *value1 = strtol(input, &input, 10);
 
-  if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
-    input_string = strdup(DEFAULT_QUANTIZER_VALUES);
+    if (*value0 < option_min_values[SCALE_FACTOR] ||
+        *value1 < option_min_values[SCALE_FACTOR] ||
+        *value0 > option_max_values[SCALE_FACTOR] ||
+        *value1 > option_max_values[SCALE_FACTOR] ||
+        *value0 > *value1)  // num shouldn't be greater than den
+      return VPX_CODEC_INVALID_PARAM;
   } else {
-    input_string = strdup(quantizer_values);
+    *value0 = atoi(input);
+    if (*value0 < option_min_values[type] ||
+        *value0 > option_max_values[type])
+      return VPX_CODEC_INVALID_PARAM;
   }
-
-  token = strtok_r(input_string, delim, &save_ptr);
-  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    if (token != NULL) {
-      q = atoi(token);
-      if (q <= 0 || q > 100) {
-        svc_log(svc_ctx, SVC_LOG_ERROR,
-                "svc-quantizer-values: invalid value %s\n", token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
-    } else {
-      q = 0;
-    }
-    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
-  }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: quantizers: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  free(input_string);
-  return res;
+  return VPX_CODEC_OK;
 }
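
extract_option() above parses a SCALE_FACTOR value as a "num/den" pair with strtol(), rejecting numerators larger than denominators, and parses every other option type as a single integer checked against option_min_values/option_max_values. A hedged usage sketch; the buffers and the expected results are illustrative:

  int num = 0, den = 0, q = 0;
  char scale[] = "7/16";     // writable buffers, since extract_option()
  char quant[] = "52";       // takes a char * it may advance through

  if (extract_option(SCALE_FACTOR, scale, &num, &den) == VPX_CODEC_OK) {
    // num == 7, den == 16
  }
  if (extract_option(QUANTIZER, quant, &q, NULL) == VPX_CODEC_OK) {
    // q == 52, inside [0, 63]
  }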
 
-static vpx_codec_err_t parse_auto_alt_ref(SvcContext *svc_ctx,
-                                          const char *alt_ref_options) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0, enabled = 0;
-  int i, value;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-
-  if (alt_ref_options == NULL || strlen(alt_ref_options) == 0) {
-    return VPX_CODEC_INVALID_PARAM;
-  } else {
-    input_string = strdup(alt_ref_options);
-  }
-
-  token = strtok_r(input_string, delim, &save_ptr);
-  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    if (token != NULL) {
-      value = atoi(token);
-      if (value < 0 || value > 1) {
-        svc_log(svc_ctx, SVC_LOG_ERROR,
-                "enable auto alt ref values: invalid value %s\n", token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
-    } else {
-      value = 0;
-    }
-    si->enable_auto_alt_ref[i] = value;
-    if (value > 0)
-      ++enabled;
-  }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: quantizers: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  if (enabled > REF_FRAMES - svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
-            "enabled auto alt reference frame, but % layers are enabled\n",
-            REF_FRAMES - svc_ctx->spatial_layers, enabled);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  free(input_string);
-  return res;
-}
-
-static void log_invalid_scale_factor(SvcContext *svc_ctx, const char *value) {
-  svc_log(svc_ctx, SVC_LOG_ERROR, "svc scale-factors: invalid value %s\n",
-          value);
-}
-
-static vpx_codec_err_t parse_scale_factors(SvcContext *svc_ctx,
-                                           const char *scale_factors) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0;
+static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
+                                                       LAYER_OPTION_TYPE type,
+                                                       const char *input,
+                                                       int *option0,
+                                                       int *option1) {
   int i;
-  int64_t num, den;
   vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  char *input_string;
+  char *token;
+  const char *delim = ",";
+  char *save_ptr;
 
-  if (scale_factors == NULL || strlen(scale_factors) == 0) {
-    input_string = strdup(DEFAULT_SCALE_FACTORS);
-  } else {
-    input_string = strdup(scale_factors);
-  }
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return VPX_CODEC_INVALID_PARAM;
+
+  input_string = strdup(input);
   token = strtok_r(input_string, delim, &save_ptr);
   for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    num = den = 0;
     if (token != NULL) {
-      num = strtol(token, &token, 10);
-      if (num <= 0) {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
+      res = extract_option(type, token, option0 + i, option1 + i);
+      if (res != VPX_CODEC_OK)
         break;
-      }
-      if (*token++ != '/') {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      den = strtol(token, &token, 10);
-      if (den <= 0) {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
       token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
+    } else {
+      break;
     }
-    si->scaling_factor_num[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
-        (int)num;
-    si->scaling_factor_den[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
-        (int)den;
   }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
+  if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: scale-factors: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
+            "svc: layer params type: %d    %d values required, "
+            "but only %d specified\n", type, svc_ctx->spatial_layers, i);
     res = VPX_CODEC_INVALID_PARAM;
   }
   free(input_string);
@@ -366,7 +205,9 @@
   char *option_name;
   char *option_value;
   char *input_ptr;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
   vpx_codec_err_t res = VPX_CODEC_OK;
+  int i, alt_ref_enabled = 0;
 
   if (options == NULL) return VPX_CODEC_OK;
   input_string = strdup(options);
@@ -382,17 +223,35 @@
       res = VPX_CODEC_INVALID_PARAM;
       break;
     }
-    if (strcmp("layers", option_name) == 0) {
+    if (strcmp("spatial-layers", option_name) == 0) {
       svc_ctx->spatial_layers = atoi(option_value);
+    } else if (strcmp("temporal-layers", option_name) == 0) {
+      svc_ctx->temporal_layers = atoi(option_value);
     } else if (strcmp("scale-factors", option_name) == 0) {
-      res = parse_scale_factors(svc_ctx, option_value);
+      res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR, option_value,
+                                            si->svc_params.scaling_factor_num,
+                                            si->svc_params.scaling_factor_den);
       if (res != VPX_CODEC_OK) break;
-    } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value);
+    } else if (strcmp("max-quantizers", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+                                            si->svc_params.max_quantizers,
+                                            NULL);
+      if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("min-quantizers", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+                                            si->svc_params.min_quantizers,
+                                            NULL);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("auto-alt-refs", option_name) == 0) {
-      res = parse_auto_alt_ref(svc_ctx, option_value);
+      res = parse_layer_options_from_string(svc_ctx, AUTO_ALT_REF, option_value,
+                                            si->enable_auto_alt_ref, NULL);
       if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("bitrates", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, BITRATE, option_value,
+                                            si->bitrates, NULL);
+      if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+      si->use_multiple_frame_contexts = atoi(option_value);
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -401,11 +260,36 @@
     option_name = strtok_r(NULL, "=", &input_ptr);
   }
   free(input_string);
+
+  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+    if (si->svc_params.max_quantizers[i] > MAX_QUANTIZER ||
+        si->svc_params.max_quantizers[i] < 0 ||
+        si->svc_params.min_quantizers[i] > si->svc_params.max_quantizers[i] ||
+        si->svc_params.min_quantizers[i] < 0)
+      res = VPX_CODEC_INVALID_PARAM;
+  }
+
+  if (si->use_multiple_frame_contexts &&
+      (svc_ctx->spatial_layers > 3 ||
+       svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4))
+    res = VPX_CODEC_INVALID_PARAM;
+
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
+    alt_ref_enabled += si->enable_auto_alt_ref[i];
+  if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
+    svc_log(svc_ctx, SVC_LOG_ERROR,
+            "svc: auto alt ref: Maximum %d (REF_FRAMES - layers) layers can "
+            "enable auto alt reference frames, but %d layers are enabled\n",
+            REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
+    res = VPX_CODEC_INVALID_PARAM;
+  }
+
   return res;
 }
 
-vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx,
+                                    const char *options) {
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || options == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -414,26 +298,86 @@
   return VPX_CODEC_OK;
 }
 
-vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizers) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
-  si->quantizers[sizeof(si->quantizers) - 1] = '\0';
-  return VPX_CODEC_OK;
-}
+void assign_layer_bitrates(const SvcContext *svc_ctx,
+                           vpx_codec_enc_cfg_t *const enc_cfg) {
+  int i;
+  const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
+  int sl, tl, spatial_layer_target;
 
-vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx,
-                                          const char *scale_factors) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || scale_factors == NULL || si == NULL) {
-    return VPX_CODEC_INVALID_PARAM;
+  if (svc_ctx->temporal_layering_mode != 0) {
+    if (si->bitrates[0] != 0) {
+      enc_cfg->rc_target_bitrate = 0;
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers] = 0;
+        for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
+          enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers]
+              += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl];
+          enc_cfg->layer_target_bitrate[sl*svc_ctx->temporal_layers + tl]
+              = si->bitrates[sl * svc_ctx->temporal_layers + tl];
+        }
+      }
+    } else {
+      float total = 0;
+      float alloc_ratio[VPX_MAX_LAYERS] = {0};
+
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        if (si->svc_params.scaling_factor_den[sl] > 0) {
+          alloc_ratio[sl] = (float)(si->svc_params.scaling_factor_num[sl] *
+              1.0 / si->svc_params.scaling_factor_den[sl]);
+          total += alloc_ratio[sl];
+        }
+      }
+
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        enc_cfg->ss_target_bitrate[sl] = spatial_layer_target =
+            (unsigned int)(enc_cfg->rc_target_bitrate *
+                alloc_ratio[sl] / total);
+        if (svc_ctx->temporal_layering_mode == 3) {
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
+              spatial_layer_target >> 1;
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
+              (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] =
+              spatial_layer_target;
+        } else if (svc_ctx->temporal_layering_mode == 2) {
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
+              spatial_layer_target * 2 / 3;
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
+              spatial_layer_target;
+        } else {
+          // User should explicitly assign bitrates in this case.
+          assert(0);
+        }
+      }
+    }
+  } else {
+    if (si->bitrates[0] != 0) {
+      enc_cfg->rc_target_bitrate = 0;
+      for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+        enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
+        enc_cfg->rc_target_bitrate += si->bitrates[i];
+      }
+    } else {
+      float total = 0;
+      float alloc_ratio[VPX_MAX_LAYERS] = {0};
+
+      for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+        if (si->svc_params.scaling_factor_den[i] > 0) {
+          alloc_ratio[i] = (float)(si->svc_params.scaling_factor_num[i] * 1.0 /
+                                   si->svc_params.scaling_factor_den[i]);
+
+          alloc_ratio[i] *= alloc_ratio[i];
+          total += alloc_ratio[i];
+        }
+      }
+      for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+        if (total > 0) {
+          enc_cfg->layer_target_bitrate[i] = (unsigned int)
+              (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
+        }
+      }
+    }
   }
-  strncpy(si->scale_factors, scale_factors, sizeof(si->scale_factors));
-  si->scale_factors[sizeof(si->scale_factors) - 1] = '\0';
-  return VPX_CODEC_OK;
 }
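
assign_layer_bitrates() above either takes explicit per-layer bitrates or derives them from the scale factors, and in temporal_layering_mode 3 turns each spatial-layer budget into cumulative temporal targets of one half, three quarters, and the full amount. A short worked sketch with illustrative numbers that are not taken from the patch:

  // Assume one spatial layer whose target works out to 1200 kbps and
  // temporal_layering_mode == 3; the shifts in the function above give:
  int spatial_layer_target = 1200;
  int tl0 = spatial_layer_target >> 1;                                  //  600
  int tl1 = (spatial_layer_target >> 1) + (spatial_layer_target >> 2);  //  900
  int tl2 = spatial_layer_target;                                       // 1200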
 
 vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
@@ -441,7 +385,7 @@
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
   int i;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
     return VPX_CODEC_INVALID_PARAM;
@@ -469,72 +413,80 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers);
-  if (res != VPX_CODEC_OK) return res;
+  // Note: temporal_layering_mode only applies to one-pass CBR
+  // si->svc_params.temporal_layering_mode = svc_ctx->temporal_layering_mode;
+  if (svc_ctx->temporal_layering_mode == 3) {
+    svc_ctx->temporal_layers = 3;
+  } else if (svc_ctx->temporal_layering_mode == 2) {
+    svc_ctx->temporal_layers = 2;
+  }
 
-  res = parse_scale_factors(svc_ctx, si->scale_factors);
-  if (res != VPX_CODEC_OK) return res;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+    si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
+    si->svc_params.min_quantizers[i] = 0;
+    si->svc_params.scaling_factor_num[i] = DEFAULT_SCALE_FACTORS_NUM[i];
+    si->svc_params.scaling_factor_den[i] = DEFAULT_SCALE_FACTORS_DEN[i];
+  }
 
   // Parse aggregate command line options. Options must start with
   // "layers=xx" then followed by other options
   res = parse_options(svc_ctx, si->options);
   if (res != VPX_CODEC_OK) return res;
 
-  si->layers = svc_ctx->spatial_layers;
+  if (svc_ctx->spatial_layers < 1)
+    svc_ctx->spatial_layers = 1;
+  if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS)
+    svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS;
 
-  // Assign target bitrate for each layer. We calculate the ratio
-  // from the resolution for now.
-  // TODO(Minghai): Optimize the mechanism of allocating bits after
-  // implementing svc two pass rate control.
-  if (si->layers > 1) {
-    float total = 0;
-    float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
+  if (svc_ctx->temporal_layers < 1)
+    svc_ctx->temporal_layers = 1;
+  if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
+    svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
 
-    assert(si->layers <= VPX_SS_MAX_LAYERS);
-    for (i = 0; i < si->layers; ++i) {
-      int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers;
-      if (pos < VPX_SS_MAX_LAYERS && si->scaling_factor_den[pos] > 0) {
-        alloc_ratio[i] = (float)(si->scaling_factor_num[pos] * 1.0 /
-            si->scaling_factor_den[pos]);
-
-        alloc_ratio[i] *= alloc_ratio[i];
-        total += alloc_ratio[i];
-      }
-    }
-
-    for (i = 0; i < si->layers; ++i) {
-      if (total > 0) {
-        enc_cfg->ss_target_bitrate[i] = (unsigned int)
-            (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
-      }
-    }
+  if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) {
+      svc_log(svc_ctx, SVC_LOG_ERROR,
+          "spatial layers * temporal layers (%d) exceeds the maximum number "
+          "of allowed layers of %d\n",
+          svc_ctx->spatial_layers * svc_ctx->temporal_layers,
+          (int) VPX_MAX_LAYERS);
+      return VPX_CODEC_INVALID_PARAM;
   }
+  assign_layer_bitrates(svc_ctx, enc_cfg);
 
 #if CONFIG_SPATIAL_SVC
-  for (i = 0; i < si->layers; ++i)
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
     enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
 #endif
 
-  // modify encoder configuration
-  enc_cfg->ss_number_layers = si->layers;
-  enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
-
-  // TODO(ivanmaltz): determine if these values need to be set explicitly for
-  // svc, or if the normal default/override mechanism can be used
-  enc_cfg->rc_dropframe_thresh = 0;
-  enc_cfg->rc_resize_allowed = 0;
-
-  if (enc_cfg->g_pass == VPX_RC_ONE_PASS) {
-    enc_cfg->rc_min_quantizer = 33;
-    enc_cfg->rc_max_quantizer = 33;
+  if (svc_ctx->temporal_layers > 1) {
+    int i;
+    for (i = 0; i < svc_ctx->temporal_layers; ++i) {
+      enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate /
+                                      svc_ctx->temporal_layers;
+      enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i);
+    }
   }
 
-  enc_cfg->rc_undershoot_pct = 100;
-  enc_cfg->rc_overshoot_pct = 15;
-  enc_cfg->rc_buf_initial_sz = 500;
-  enc_cfg->rc_buf_optimal_sz = 600;
-  enc_cfg->rc_buf_sz = 1000;
-  enc_cfg->g_error_resilient = 1;
+  if (svc_ctx->threads)
+    enc_cfg->g_threads = svc_ctx->threads;
+
+  // Modify encoder configuration
+  enc_cfg->ss_number_layers = svc_ctx->spatial_layers;
+  enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
+
+  if (enc_cfg->rc_end_usage == VPX_CBR) {
+    enc_cfg->rc_resize_allowed = 0;
+    enc_cfg->rc_min_quantizer = 2;
+    enc_cfg->rc_max_quantizer = 63;
+    enc_cfg->rc_undershoot_pct = 50;
+    enc_cfg->rc_overshoot_pct = 50;
+    enc_cfg->rc_buf_initial_sz = 20;
+    enc_cfg->rc_buf_optimal_sz = 600;
+    enc_cfg->rc_buf_sz = 1000;
+  }
+
+  if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+    enc_cfg->g_error_resilient = 1;
 
   // Initialize codec
   res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
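
The temporal-layer setup added earlier in this function splits rc_target_bitrate evenly across the temporal layers and sets ts_rate_decimator[i] = 1 << (temporal_layers - 1 - i), so the base layer runs at the lowest frame rate and the top layer at full rate. A hedged worked example assuming three temporal layers:

  // With svc_ctx->temporal_layers == 3 the loop above produces:
  //   ts_rate_decimator[0] = 1 << 2 = 4   (base layer: every 4th frame)
  //   ts_rate_decimator[1] = 1 << 1 = 2   (middle layer: every 2nd frame)
  //   ts_rate_decimator[2] = 1 << 0 = 1   (top layer: every frame)
  // and ts_target_bitrate[i] = rc_target_bitrate / 3 for each i.
  int L = 3, i, dec[3];
  for (i = 0; i < L; ++i)
    dec[i] = 1 << (L - 1 - i);   // dec = {4, 2, 1}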
@@ -544,109 +496,29 @@
   }
 
   vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
-  vpx_codec_control(codec_ctx, VP8E_SET_TOKEN_PARTITIONS, 1);
+  vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &si->svc_params);
 
   return VPX_CODEC_OK;
 }
 
-vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
-                                             int layer,
-                                             unsigned int *width,
-                                             unsigned int *height) {
-  int w, h, index, num, den;
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-
-  if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL) {
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  if (layer < 0 || layer >= si->layers) return VPX_CODEC_INVALID_PARAM;
-
-  index = layer + VPX_SS_MAX_LAYERS - si->layers;
-  num = si->scaling_factor_num[index];
-  den = si->scaling_factor_den[index];
-  if (num == 0 || den == 0) return VPX_CODEC_INVALID_PARAM;
-
-  w = si->width * num / den;
-  h = si->height * num / den;
-
-  // make height and width even to make chrome player happy
-  w += w % 2;
-  h += h % 2;
-
-  *width = w;
-  *height = h;
-
-  return VPX_CODEC_OK;
-}
-
-static void set_svc_parameters(SvcContext *svc_ctx,
-                               vpx_codec_ctx_t *codec_ctx) {
-  int layer, layer_index;
-  vpx_svc_parameters_t svc_params;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-
-  memset(&svc_params, 0, sizeof(svc_params));
-  svc_params.temporal_layer = 0;
-  svc_params.spatial_layer = si->layer;
-
-  layer = si->layer;
-  if (VPX_CODEC_OK != vpx_svc_get_layer_resolution(svc_ctx, layer,
-                                                   &svc_params.width,
-                                                   &svc_params.height)) {
-    svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
-  }
-  layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
-
-  if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
-    svc_params.min_quantizer = si->quantizer[layer_index];
-    svc_params.max_quantizer = si->quantizer[layer_index];
-  } else {
-    svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
-    svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
-  }
-
-  svc_params.distance_from_i_frame = si->frame_within_gop;
-  vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &svc_params);
-}
-
 /**
  * Encode a frame into multiple layers
  * Create a superframe containing the individual layers
  */
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
-                               struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+                               vpx_codec_ctx_t *codec_ctx,
+                               struct vpx_image *rawimg,
+                               vpx_codec_pts_t pts,
                                int64_t duration, int deadline) {
   vpx_codec_err_t res;
   vpx_codec_iter_t iter;
   const vpx_codec_cx_pkt_t *cx_pkt;
-  int layer_for_psnr = 0;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
 
   svc_log_reset(svc_ctx);
-  si->rc_stats_buf_used = 0;
-
-  si->layers = svc_ctx->spatial_layers;
-  if (si->encode_frame_count == 0) {
-    si->frame_within_gop = 0;
-  }
-  si->is_keyframe = (si->frame_within_gop == 0);
-
-  if (rawimg != NULL) {
-    svc_log(svc_ctx, SVC_LOG_DEBUG,
-            "vpx_svc_encode  layers: %d, frame_count: %d, "
-            "frame_within_gop: %d\n", si->layers, si->encode_frame_count,
-            si->frame_within_gop);
-  }
-
-  if (rawimg != NULL) {
-    // encode each layer
-    for (si->layer = 0; si->layer < si->layers; ++si->layer) {
-      set_svc_parameters(svc_ctx, codec_ctx);
-    }
-  }
 
   res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0,
                          deadline);
@@ -657,129 +529,61 @@
   iter = NULL;
   while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
     switch (cx_pkt->kind) {
-      case VPX_CODEC_CX_FRAME_PKT: {
-        fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
-                                               cx_pkt->data.frame.sz,
-                                               cx_pkt->data.frame.flags));
-
-        svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
-                "pts: %d\n", si->frame_received,
-                (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
-                (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-
-        ++si->frame_received;
-        layer_for_psnr = 0;
-        break;
-      }
-      case VPX_CODEC_PSNR_PKT: {
-        int i;
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
-                cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
-                cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
-        for (i = 0; i < COMPONENTS; i++) {
-          si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
-          si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
-        }
-        ++layer_for_psnr;
-        break;
-      }
-      case VPX_CODEC_STATS_PKT: {
-        size_t new_size = si->rc_stats_buf_used +
-            cx_pkt->data.twopass_stats.sz;
-
-        if (new_size > si->rc_stats_buf_size) {
-          char *p = (char*)realloc(si->rc_stats_buf, new_size);
-          if (p == NULL) {
-            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
-            return VPX_CODEC_MEM_ERROR;
-          }
-          si->rc_stats_buf = p;
-          si->rc_stats_buf_size = new_size;
-        }
-
-        memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
-               cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
-        si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
-        break;
-      }
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
 #if CONFIG_SPATIAL_SVC
+      case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
+        int i;
+        for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+          int j;
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].psnr[0],
+                  cx_pkt->data.layer_psnr[i].psnr[1],
+                  cx_pkt->data.layer_psnr[i].psnr[2],
+                  cx_pkt->data.layer_psnr[i].psnr[3]);
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].sse[0],
+                  cx_pkt->data.layer_psnr[i].sse[1],
+                  cx_pkt->data.layer_psnr[i].sse[2],
+                  cx_pkt->data.layer_psnr[i].sse[3]);
+
+          for (j = 0; j < COMPONENTS; ++j) {
+            si->psnr_sum[i][j] +=
+                cx_pkt->data.layer_psnr[i].psnr[j];
+            si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
+          }
+        }
+        ++si->psnr_pkt_received;
+        break;
+      }
       case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
         int i;
-        for (i = 0; i < si->layers; ++i)
+        for (i = 0; i < svc_ctx->spatial_layers; ++i)
           si->bytes_sum[i] += cx_pkt->data.layer_sizes[i];
         break;
       }
 #endif
+#endif
       default: {
         break;
       }
     }
   }
 
-  if (rawimg != NULL) {
-    ++si->frame_within_gop;
-    ++si->encode_frame_count;
-  }
-
   return VPX_CODEC_OK;
 }
 
 const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+  const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return NULL;
   return si->message_buffer;
 }
 
-// We will maintain a list of output frame buffers since with lag_in_frame
-// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
-// remove a frame buffer from the list the put it to a temporal pointer, which
-// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
-void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
-
-  if (si->frame_temp)
-    fd_free(si->frame_temp);
-
-  si->frame_temp = si->frame_list;
-  si->frame_list = si->frame_list->next;
-
-  return si->frame_temp->buf;
-}
-
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return si->frame_list->size;
-}
-
-int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->encode_frame_count;
-}
-
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
-}
-
-void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return;
-  si->frame_within_gop = 0;
-}
-
 static double calc_psnr(double d) {
   if (d == 0) return 100;
   return -10.0 * log(d) / log(10.0);
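/*
 * Illustrative note (not part of the patch): calc_psnr() converts a
 * normalized mean-squared error d into decibels, PSNR = -10 * log10(d),
 * clamping a perfect reconstruction (d == 0) to 100 dB.  A quick check with
 * a hypothetical value:
 */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double d = 0.0001;  /* normalized MSE, hypothetical */
  printf("%.1f dB\n", -10.0 * log(d) / log(10.0));  /* prints 40.0 */
  return 0;
}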
@@ -787,7 +591,7 @@
 
 // dump accumulated statistics and reset accumulated values
 const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
-  int number_of_frames, encode_frame_count;
+  int number_of_frames;
   int i, j;
   uint32_t bytes_total = 0;
   double scale[COMPONENTS];
@@ -795,17 +599,16 @@
   double mse[COMPONENTS];
   double y_scale;
 
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return NULL;
 
   svc_log_reset(svc_ctx);
 
-  encode_frame_count = si->encode_frame_count;
-  if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+  number_of_frames = si->psnr_pkt_received;
+  if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx);
 
   svc_log(svc_ctx, SVC_LOG_INFO, "\n");
-  for (i = 0; i < si->layers; ++i) {
-    number_of_frames = encode_frame_count;
+  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
 
     svc_log(svc_ctx, SVC_LOG_INFO,
             "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
@@ -831,7 +634,7 @@
             mse[1], mse[2], mse[3]);
 
     bytes_total += si->bytes_sum[i];
-    // clear sums for next time
+    // Clear sums for next time.
     si->bytes_sum[i] = 0;
     for (j = 0; j < COMPONENTS; ++j) {
       si->psnr_sum[i][j] = 0;
@@ -840,39 +643,21 @@
   }
 
   // only display statistics once
-  si->encode_frame_count = 0;
+  si->psnr_pkt_received = 0;
 
   svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total);
   return vpx_svc_get_message(svc_ctx);
 }
 
 void vpx_svc_release(SvcContext *svc_ctx) {
-  SvcInternal *si;
+  SvcInternal_t *si;
   if (svc_ctx == NULL) return;
   // do not use get_svc_internal as it will unnecessarily allocate an
-  // SvcInternal if it was not already allocated
-  si = (SvcInternal *)svc_ctx->internal;
+  // SvcInternal_t if it was not already allocated
+  si = (SvcInternal_t *)svc_ctx->internal;
   if (si != NULL) {
-    fd_free(si->frame_temp);
-    fd_free_list(si->frame_list);
-    if (si->rc_stats_buf) {
-      free(si->rc_stats_buf);
-    }
     free(si);
     svc_ctx->internal = NULL;
   }
 }
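/*
 * Illustrative sketch (not part of the patch): the slimmed-down calling
 * sequence left after this change.  The removed per-frame helpers
 * (vpx_svc_get_buffer, vpx_svc_is_keyframe, ...) are no longer needed;
 * encoded packets come out of the normal vpx_codec_get_cx_data() interface.
 * Layer counts and frame handling below are arbitrary, and error checking is
 * mostly omitted.
 */
#include <stdio.h>
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static int svc_encode_clip(vpx_image_t **frames, int frame_count,
                           unsigned int width, unsigned int height) {
  SvcContext svc = {0};
  vpx_codec_ctx_t codec;
  vpx_codec_enc_cfg_t cfg;
  int i;

  svc.spatial_layers = 2;          /* example values */
  svc.temporal_layers = 2;
  svc.log_level = SVC_LOG_INFO;

  vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
  cfg.g_w = width;
  cfg.g_h = height;
  if (vpx_svc_init(&svc, &codec, vpx_codec_vp9_cx(), &cfg) != VPX_CODEC_OK)
    return -1;

  for (i = 0; i < frame_count; ++i)
    vpx_svc_encode(&svc, &codec, frames[i], i, 1, VPX_DL_GOOD_QUALITY);
  vpx_svc_encode(&svc, &codec, NULL, frame_count, 1, VPX_DL_GOOD_QUALITY);

  printf("%s", vpx_svc_dump_statistics(&svc));
  vpx_svc_release(&svc);
  vpx_codec_destroy(&codec);
  return 0;
}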
 
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->rc_stats_buf_used;
-}
-
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->rc_stats_buf;
-}
-
-
diff --git a/libvpx/vpx/src/vpx_codec.c b/libvpx/vpx/src/vpx_codec.c
index d175eae..5a495ce 100644
--- a/libvpx/vpx/src/vpx_codec.c
+++ b/libvpx/vpx/src/vpx_codec.c
@@ -88,8 +88,7 @@
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
   else {
-    if (ctx->priv->alg_priv)
-      ctx->iface->destroy(ctx->priv->alg_priv);
+    ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
 
     ctx->iface = NULL;
     ctx->name = NULL;
@@ -125,7 +124,7 @@
         va_list  ap;
 
         va_start(ap, ctrl_id);
-        res = entry->fn(ctx->priv->alg_priv, ap);
+        res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap);
         va_end(ap);
         break;
       }
diff --git a/libvpx/vpx/src/vpx_decoder.c b/libvpx/vpx/src/vpx_decoder.c
index 4d22a08..802d8ed 100644
--- a/libvpx/vpx/src/vpx_decoder.c
+++ b/libvpx/vpx/src/vpx_decoder.c
@@ -18,9 +18,13 @@
 
 #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
 
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
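/*
 * Illustrative sketch (not part of the patch): the cast in get_alg_priv()
 * is assumed to be valid because each codec's private struct now begins with
 * the shared vpx_codec_priv_t, so ctx->priv and the enclosing
 * algorithm-private struct share the same address.  The stand-in types below
 * are hypothetical, not the real libvpx definitions.
 */
#include <assert.h>
#include <stddef.h>

typedef struct { int err; } base_priv_t;                    /* stands in for vpx_codec_priv_t */
typedef struct { base_priv_t base; int state; } alg_priv_t; /* codec-specific priv */

int main(void) {
  alg_priv_t alg = { { 0 }, 42 };
  base_priv_t *priv = &alg.base;          /* what ctx->priv would point at */
  assert((void *)priv == (void *)&alg);   /* first member: casting back is safe */
  assert(offsetof(alg_priv_t, base) == 0);
  return 0;
}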
+
 vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_dec_cfg_t  *cfg,
+                                       const vpx_codec_dec_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -54,9 +58,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);
@@ -97,7 +98,7 @@
     si->w = 0;
     si->h = 0;
 
-    res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si);
+    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
   }
 
   return SAVE_STATUS(ctx, res);
@@ -118,8 +119,8 @@
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
   else {
-    res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz,
-                                 user_priv, deadline);
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
+                                 deadline);
   }
 
   return SAVE_STATUS(ctx, res);
@@ -132,7 +133,7 @@
   if (!ctx || !iter || !ctx->iface || !ctx->priv)
     img = NULL;
   else
-    img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter);
+    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
 
   return img;
 }
@@ -188,7 +189,7 @@
              !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
     res = VPX_CODEC_ERROR;
   } else {
-    res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+    res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                     cb_priv);
   }
 
diff --git a/libvpx/vpx/src/vpx_encoder.c b/libvpx/vpx/src/vpx_encoder.c
index 6e18bd1..cd10c41 100644
--- a/libvpx/vpx/src/vpx_encoder.c
+++ b/libvpx/vpx/src/vpx_encoder.c
@@ -15,14 +15,18 @@
  */
 #include <limits.h>
 #include <string.h>
-#include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
 
 #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
 
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
 vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_enc_cfg_t  *cfg,
+                                       const vpx_codec_enc_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -53,9 +57,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);
@@ -135,9 +136,6 @@
           }
         }
 
-        if (ctx->priv)
-          ctx->priv->iface = ctx->iface;
-
         if (res)
           break;
 
@@ -222,7 +220,7 @@
     FLOATING_POINT_INIT();
 
     if (num_enc == 1)
-      res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
                                    duration, flags, deadline);
     else {
       /* Multi-resolution encoding:
@@ -236,7 +234,7 @@
       if (img) img += num_enc - 1;
 
       for (i = num_enc - 1; i >= 0; i--) {
-        if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+        if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
                                           duration, flags, deadline)))
           break;
 
@@ -265,7 +263,7 @@
     else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter);
+      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
   }
 
   if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
@@ -333,7 +331,7 @@
     else if (!ctx->iface->enc.get_preview)
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      img = ctx->iface->enc.get_preview(ctx->priv->alg_priv);
+      img = ctx->iface->enc.get_preview(get_alg_priv(ctx));
   }
 
   return img;
@@ -351,7 +349,7 @@
     else if (!ctx->iface->enc.get_glob_hdrs)
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv);
+      buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
   }
 
   return buf;
@@ -367,7 +365,7 @@
   else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
     res = VPX_CODEC_INCAPABLE;
   else
-    res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg);
+    res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
 
   return SAVE_STATUS(ctx, res);
 }
diff --git a/libvpx/vpx/src/vpx_image.c b/libvpx/vpx/src/vpx_image.c
index 8c7e3cf..9aae12c 100644
--- a/libvpx/vpx/src/vpx_image.c
+++ b/libvpx/vpx/src/vpx_image.c
@@ -8,49 +8,23 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <stdlib.h>
 #include <string.h>
+
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
-#define ADDRESS_STORAGE_SIZE      sizeof(size_t)
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
-
-/* Memalign code is copied from vpx_mem.c */
-static void *img_buf_memalign(size_t align, size_t size) {
-  void *addr,
-       * x = NULL;
-
-  addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
-
-  if (addr) {
-    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
-    /* save the actual malloc address */
-    ((size_t *)x)[-1] = (size_t)addr;
-  }
-
-  return x;
-}
-
-static void img_buf_free(void *memblk) {
-  if (memblk) {
-    void *addr = (void *)(((size_t *)memblk)[-1]);
-    free(addr);
-  }
-}
-
-static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
-                                     vpx_img_fmt_t  fmt,
-                                     unsigned int   d_w,
-                                     unsigned int   d_h,
-                                     unsigned int   buf_align,
-                                     unsigned int   stride_align,
+static vpx_image_t *img_alloc_helper(vpx_image_t *img,
+                                     vpx_img_fmt_t fmt,
+                                     unsigned int d_w,
+                                     unsigned int d_h,
+                                     unsigned int buf_align,
+                                     unsigned int stride_align,
                                      unsigned char *img_data) {
-
-  unsigned int  h, w, s, xcs, ycs, bps;
-  int           align;
+  unsigned int h, w, s, xcs, ycs, bps;
+  unsigned int stride_in_bytes;
+  int align;
 
   /* Treat align==0 like align==1 */
   if (!buf_align)
@@ -96,6 +70,7 @@
       bps = 12;
       break;
     case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I440:
       bps = 16;
       break;
     case VPX_IMG_FMT_I444:
@@ -105,6 +80,7 @@
       bps = 24;
       break;
     case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44016:
       bps = 32;
       break;
     case VPX_IMG_FMT_I44416:
@@ -133,9 +109,12 @@
 
   switch (fmt) {
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I440:
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_VPXI420:
     case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I44016:
       ycs = 1;
       break;
     default:
@@ -150,6 +129,7 @@
   h = (d_h + align) & ~align;
   s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8;
   s = (s + stride_align - 1) & ~(stride_align - 1);
+  stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
 
   /* Allocate the new image */
   if (!img) {
@@ -172,7 +152,7 @@
     if (alloc_size != (size_t)alloc_size)
       goto fail;
 
-    img->img_data = img_buf_memalign(buf_align, (size_t)alloc_size);
+    img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);
     img->img_data_owner = 1;
   }
 
@@ -180,7 +160,7 @@
     goto fail;
 
   img->fmt = fmt;
-  img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
+  img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
   img->w = w;
   img->h = h;
   img->x_chroma_shift = xcs;
@@ -188,8 +168,8 @@
   img->bps = bps;
 
   /* Calculate strides */
-  img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = s;
-  img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = s >> xcs;
+  img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes;
+  img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs;
 
   /* Default viewport to entire image */
   if (!vpx_img_set_rect(img, 0, 0, d_w, d_h))
@@ -235,39 +215,40 @@
       img->planes[VPX_PLANE_PACKED] =
         img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED];
     } else {
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
       data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
-          data + x + y * img->stride[VPX_PLANE_ALPHA];
+            data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
         data += img->h * img->stride[VPX_PLANE_ALPHA];
       }
 
-      img->planes[VPX_PLANE_Y] = data + x + y * img->stride[VPX_PLANE_Y];
+      img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample +
+          y * img->stride[VPX_PLANE_Y];
       data += img->h * img->stride[VPX_PLANE_Y];
 
       if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
       } else {
-        img->planes[VPX_PLANE_V] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
         data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
-        img->planes[VPX_PLANE_U] = data
-                                   + (x >> img->x_chroma_shift)
-                                   + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
       }
     }
-
     return 0;
   }
-
   return -1;
 }
 
@@ -296,7 +277,7 @@
 void vpx_img_free(vpx_image_t *img) {
   if (img) {
     if (img->img_data && img->img_data_owner)
-      img_buf_free(img->img_data);
+      vpx_free(img->img_data);
 
     if (img->self_allocd)
       free(img);
diff --git a/libvpx/vpx/svc_context.h b/libvpx/vpx/svc_context.h
index e0de263..a09651c 100644
--- a/libvpx/vpx/svc_context.h
+++ b/libvpx/vpx/svc_context.h
@@ -31,15 +31,49 @@
 
 typedef struct {
   // public interface to svc_command options
-  int spatial_layers;               // number of layers
+  int spatial_layers;               // number of spatial layers
+  int temporal_layers;               // number of temporal layers
+  int temporal_layering_mode;
   SVC_LOG_LEVEL log_level;  // amount of information to display
   int log_print;  // when set, printf log messages instead of returning the
                   // message with svc_get_message
-
+  int output_rc_stat;  // for outputting rc stats
+  int speed;  // speed setting for codec
+  int threads;
   // private storage for vpx_svc_encode
   void *internal;
 } SvcContext;
 
+#define OPTION_BUFFER_SIZE 1024
+#define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
+
+typedef struct SvcInternal {
+  char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
+
+  // values extracted from option, quantizers
+  vpx_svc_extra_cfg_t svc_params;
+  int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+  int bitrates[VPX_SS_MAX_LAYERS];
+
+  // accumulated statistics
+  double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
+  uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
+  uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
+
+  // codec encoding values
+  int width;    // width of highest layer
+  int height;   // height of highest layer
+  int kf_dist;  // distance between keyframes
+
+  // state variables
+  int psnr_pkt_received;
+  int layer;
+  int use_multiple_frame_contexts;
+
+  char message_buffer[2048];
+  vpx_codec_ctx_t *codec_ctx;
+} SvcInternal_t;
+
 /**
  * Set SVC options
  * options are supplied as a single string separated by spaces
@@ -51,32 +85,19 @@
 vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
 
 /**
- * Set SVC quantizer values
- * values comma separated, ordered from lowest resolution to highest
- * e.g., "60,53,39,33,27"
- */
-vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizer_values);
-
-/**
- * Set SVC scale factors
- * values comma separated, ordered from lowest resolution to highest
- * e.g.,  "4/16,5/16,7/16,11/16,16/16"
- */
-vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx,
-                                          const char *scale_factors);
-
-/**
  * initialize SVC encoding
  */
-vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx,
+                             vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *cfg);
 /**
  * encode a frame of video with multiple layers
  */
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
-                               struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+                               vpx_codec_ctx_t *codec_ctx,
+                               struct vpx_image *rawimg,
+                               vpx_codec_pts_t pts,
                                int64_t duration, int deadline);
 
 /**
@@ -94,51 +115,6 @@
  */
 const char *vpx_svc_get_message(const SvcContext *svc_ctx);
 
-/**
- * return size of encoded data to be returned by vpx_svc_get_buffer.
- * it needs to be called before vpx_svc_get_buffer.
- */
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer with encoded data. encoder will maintain a list of frame
- * buffers. each call of vpx_svc_get_buffer() will return one frame.
- */
-void *vpx_svc_get_buffer(SvcContext *svc_ctx);
-
-/**
- * return size of two pass rate control stats data to be returned by
- * vpx_svc_get_rc_stats_buffer
- */
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer two pass of rate control stats data
- */
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx);
-
-/**
- * return spatial resolution of the specified layer
- */
-vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
-                                             int layer,
-                                             unsigned int *width,
-                                             unsigned int *height);
-/**
- * return number of frames that have been encoded
- */
-int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx);
-
-/**
- * return 1 if last encoded frame was a keyframe
- */
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx);
-
-/**
- * force the next frame to be a keyframe
- */
-void vpx_svc_set_keyframe(SvcContext *svc_ctx);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index 796a7a1..31120df 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -10,15 +10,16 @@
 #ifndef VPX_VP8CX_H_
 #define VPX_VP8CX_H_
 
-/*!\defgroup vp8_encoder WebM VP8 Encoder
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
  * \ingroup vp8
  *
  * @{
  */
 #include "./vp8.h"
+#include "./vpx_encoder.h"
 
 /*!\file
- * \brief Provides definitions for using the VP8 encoder algorithm within the
+ * \brief Provides definitions for using the VP8 or VP9 encoder algorithm within the
  *        vpx Codec Interface.
  */
 
@@ -28,19 +29,30 @@
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to encode raw VP8 streams, as would
- * be found in AVI files.
+ * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
-
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
-extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;
-extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
-
 /*!@} - end algorithm interface member group*/
 
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;
+extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
+/*!@} - end algorithm interface member group*/
+
+/*!\name Algorithm interface for VP10
+ *
+ * This interface provides the capability to encode raw VP10 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp10_cx_algo;
+extern vpx_codec_iface_t *vpx_codec_vp10_cx(void);
+/*!@} - end algorithm interface member group*/
 
 /*
  * Algorithm Flags
@@ -121,61 +133,145 @@
 #define VP8_EFLAG_NO_UPD_ENTROPY   (1<<20)
 
 
-/*!\brief VP8 encoder control functions
+/*!\brief VPx encoder control functions
  *
- * This set of macros define the control functions available for the VP8
+ * This set of macros define the control functions available for VPx
  * encoder interface.
  *
  * \sa #vpx_codec_control
  */
 enum vp8e_enc_control_id {
-  VP8E_UPD_ENTROPY           = 5,  /**< control function to set mode of entropy update in encoder */
-  VP8E_UPD_REFERENCE,              /**< control function to set reference update mode in encoder */
-  VP8E_USE_REFERENCE,              /**< control function to set which reference frame encoder can use */
-  VP8E_SET_ROI_MAP,                /**< control function to pass an ROI map to encoder */
-  VP8E_SET_ACTIVEMAP,              /**< control function to pass an Active map to encoder */
-  VP8E_SET_SCALEMODE         = 11, /**< control function to set encoder scaling mode */
-  /*!\brief control function to set vp8 encoder cpuused
+  /*!\brief Codec control function to set mode of entropy update in encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_UPD_ENTROPY           = 5,
+
+  /*!\brief Codec control function to set reference update mode in encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_UPD_REFERENCE,
+
+  /*!\brief Codec control function to set which reference frame encoder can use.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_USE_REFERENCE,
+
+  /*!\brief Codec control function to pass an ROI map to encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ROI_MAP,
+
+  /*!\brief Codec control function to pass an Active map to encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set encoder scaling mode.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_SCALEMODE         = 11,
+
+  /*!\brief Codec control function to set encoder internal speed settings.
    *
    * Changes in this value influences, among others, the encoder's selection
    * of motion estimation methods. Values greater than 0 will increase encoder
    * speed at the expense of quality.
-   * The full set of adjustments can be found in
-   * onyx_if.c:vp8_set_speed_features().
-   * \todo List highlights of the changes at various levels.
    *
-   * \note Valid range: -16..16
+   * \note Valid range for VP8: -16..16
+   * \note Valid range for VP9: -8..8
+   *
+   * Supported in codecs: VP8, VP9
    */
   VP8E_SET_CPUUSED           = 13,
-  VP8E_SET_ENABLEAUTOALTREF,       /**< control function to enable vp8 to automatic set and use altref frame */
-  VP8E_SET_NOISE_SENSITIVITY,      /**< control function to set noise sensitivity */
-  VP8E_SET_SHARPNESS,              /**< control function to set sharpness */
-  VP8E_SET_STATIC_THRESHOLD,       /**< control function to set the threshold for macroblocks treated static */
-  VP8E_SET_TOKEN_PARTITIONS,       /**< control function to set the number of token partitions  */
-  VP8E_GET_LAST_QUANTIZER,         /**< return the quantizer chosen by the
-                                          encoder for the last frame using the internal
-                                          scale */
-  VP8E_GET_LAST_QUANTIZER_64,      /**< return the quantizer chosen by the
-                                          encoder for the last frame, using the 0..63
-                                          scale as used by the rc_*_quantizer config
-                                          parameters */
-  VP8E_SET_ARNR_MAXFRAMES,         /**< control function to set the max number of frames blurred creating arf*/
-  VP8E_SET_ARNR_STRENGTH,          //!< control function to set the filter
-                                   //!< strength for the arf
 
-  /*!\deprecated control function to set the filter type to use for the arf */
+  /*!\brief Codec control function to enable automatic set and use of alt-ref frames.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ENABLEAUTOALTREF,
+
+  /*!\brief control function to set noise sensitivity
+   *
+   * 0: off, 1: OnYOnly, 2: OnYUV,
+   * 3: OnYUVAggressive, 4: Adaptive
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to set sharpness.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_SHARPNESS,
+
+  /*!\brief Codec control function to set the threshold for MBs treated static.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_STATIC_THRESHOLD,
+
+  /*!\brief Codec control function to set the number of token partitions.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_TOKEN_PARTITIONS,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses internal quantizer scale defined by the codec.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+   * parameters.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_GET_LAST_QUANTIZER_64,
+
+  /*!\brief Codec control function to set the max number of frames to create arf.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ARNR_MAXFRAMES,
+
+  /*!\brief Codec control function to set the filter strength for the arf.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ARNR_STRENGTH,
+
+  /*!\deprecated control function to set the filter type to use for the arf. */
   VP8E_SET_ARNR_TYPE,
 
-  VP8E_SET_TUNING,                 /**< control function to set visual tuning */
-  /*!\brief control function to set constrained quality level
+  /*!\brief Codec control function to set visual tuning.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_TUNING,
+
+  /*!\brief Codec control function to set constrained quality level.
    *
    * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be
    *            set to #VPX_CQ.
    * \note Valid range: 0..63
+   *
+   * Supported in codecs: VP8, VP9
    */
   VP8E_SET_CQ_LEVEL,
 
-  /*!\brief Max data rate for Intra frames
+  /*!\brief Codec control function to set Max data rate for Intra frames.
    *
    * This value controls additional clamping on the maximum size of a
    * keyframe. It is expressed as a percentage of the average
@@ -186,27 +282,271 @@
    * For example, to allocate no more than 4.5 frames worth of bitrate
    * to a keyframe, set this to 450.
    *
+   * Supported in codecs: VP8, VP9
    */
   VP8E_SET_MAX_INTRA_BITRATE_PCT,
 
+  /*!\brief Codec control function to set reference and update frame flags.
+   *
+   *  Supported in codecs: VP8
+   */
+  VP8E_SET_FRAME_FLAGS,
 
-  /* TODO(jkoleszar): Move to vp9cx.h */
+  /*!\brief Codec control function to set max data rate for Inter frames.
+   *
+   * This value controls additional clamping on the maximum size of an
+   * inter frame. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * unlimited, or no additional clamping beyond the codec's built-in
+   * algorithm.
+   *
+   * For example, to allow no more than 4.5 frames worth of bitrate
+   * to an inter frame, set this to 450.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MAX_INTER_BITRATE_PCT,
+
+  /*!\brief Boost percentage for Golden Frame in CBR mode.
+   *
+   * This value controls the amount of boost given to Golden Frame in
+   * CBR mode. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * the feature is off, i.e., no golden frame boost in CBR mode and
+   * average bitrate target is used.
+   *
+   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
+   * than in an average frame, set this to 100.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_GF_CBR_BOOST_PCT,
+
+  /*!\brief Codec control function to set the temporal layer id.
+   *
+   * For temporal scalability: this control allows the application to set the
+   * layer id for each frame to be encoded. Note that this control must be set
+   * for every frame prior to encoding. The usage of this control function
+   * supersedes the internal temporal pattern counter, which is now deprecated.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_TEMPORAL_LAYER_ID,
+
+  /*!\brief Codec control function to set encoder screen content mode.
+   *
+   * 0: off, 1: On, 2: On with more aggressive rate control.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_SCREEN_CONTENT_MODE,
+
+  /*!\brief Codec control function to set lossless encoding mode.
+   *
+   * VP9 can operate in lossless encoding mode, in which the bitstream
+   * produced will be able to decode and reconstruct a perfect copy of
+   * input source. This control function provides a means to switch the
+   * encoder into lossless coding mode (1) or normal, possibly lossy, mode (0).
+   *                          0 = lossy coding mode
+   *                          1 = lossless coding mode
+   *
+   *  By default, the encoder operates in normal (possibly lossy) coding mode.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_LOSSLESS,
+
+  /*!\brief Codec control function to set number of tile columns.
+   *
+   * In encoding and decoding, VP9 allows an input image frame to be partitioned
+   * into separate vertical tile columns, which can be encoded or decoded
+   * independently. This enables easy implementation of parallel encoding and
+   * decoding. This control requests the encoder to use column tiles in
+   * encoding an input frame, with number of tile columns (in Log2 unit) as
+   * the parameter:
+   *             0 = 1 tile column
+   *             1 = 2 tile columns
+   *             2 = 4 tile columns
+   *             .....
+   *             n = 2**n tile columns
+   * The requested tile columns will be capped by the encoder based on image
+   * size limitations (the minimum width of a tile column is 256 pixels, the
+   * maximum is 4096).
+   *
+   * By default, the value is 0, i.e. a single tile column for the entire image.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_TILE_COLUMNS,
+
+  /*!\brief Codec control function to set number of tile rows.
+   *
+   * In encoding and decoding, VP9 allows an input image frame to be partitioned
+   * into separate horizontal tile rows. Tile rows are encoded or decoded
+   * sequentially. Even though encoding/decoding of later tile rows depends on
+   * earlier ones, this allows the encoder to output data packets for tile rows
+   * prior to completely processing all tile rows in a frame, thereby reducing
+   * the latency in processing between input and output. The parameter
+   * for this control describes the number of tile rows, which has a valid
+   * range [0, 2]:
+   *            0 = 1 tile row
+   *            1 = 2 tile rows
+   *            2 = 4 tile rows
+   *
+   * By default, the value is 0, i.e. a single tile row for the entire image.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_TILE_ROWS,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature.
+   *
+   * VP9 has a bitstream feature to reduce decoding dependency between frames
+   * by turning off backward update of probability context used in encoding
+   * and decoding. This allows staged parallel processing of more than one
+   * video frame in the decoder. This control function provides a means to
+   * turn this feature on or off for bitstreams produced by the encoder.
+   *
+   * By default, this feature is off.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  /*!\brief Codec control function to set adaptive quantization mode.
+   *
+   * VP9 has a segment-based feature that allows the encoder to adaptively
+   * change the quantization parameter for each segment within a frame to
+   * improve the subjective quality. This control makes the encoder operate
+   * in one of the supported AQ modes.
+   *
+   * By default, the encoder operates with AQ mode 0 (adaptive quantization off).
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_AQ_MODE,
+
+  /*!\brief Codec control function to enable/disable periodic Q boost.
+   *
+   * One VP9 encoder speed feature is to enable a quality boost by periodically
+   * lowering the frame-level Q. This control function provides a means to
+   * turn this feature on or off.
+   *               0 = off
+   *               1 = on
+   *
+   * By default, the encoder is allowed to use this feature for appropriate
+   * encoding modes.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_FRAME_PERIODIC_BOOST,
 
+  /*!\brief Codec control function to set noise sensitivity.
+   *
+   *  0: off, 1: On(YOnly)
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to turn on/off SVC in encoder.
+   * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+   *       support SVC in its current encoding mode
+   *  0: off, 1: on
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_SVC,
+
+  /*!\brief Codec control function to set parameters for SVC.
+   * \note Parameters contain min_q, max_q, scaling factor for each of the
+   *       SVC layers.
+   *
+   * Supported in codecs: VP9
+   */
   VP9E_SET_SVC_PARAMETERS,
-  /*!\brief control function to set svc layer for spatial and temporal.
+
+  /*!\brief Codec control function to set svc layer for spatial and temporal.
    * \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial
    *                     layer and 0..#vpx_codec_enc_cfg::ts_number_layers for
    *                     temporal layer.
+   *
+   * Supported in codecs: VP9
    */
   VP9E_SET_SVC_LAYER_ID,
-  VP9E_SET_TUNE_CONTENT
+
+  /*!\brief Codec control function to set content type.
+   * \note Valid parameter range:
+   *              VP9E_CONTENT_DEFAULT = Regular video content (Default)
+   *              VP9E_CONTENT_SCREEN  = Screen capture content
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TUNE_CONTENT,
+
+  /*!\brief Codec control function to get svc layer ID.
+   * \note The layer ID returned is for the data packet from the registered
+   *       callback function.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_SVC_LAYER_ID,
+
+  /*!\brief Codec control function to register a callback to get per-layer packets.
+   * \note Parameter for this control function is a structure with a callback
+   *       function and a pointer to private data used by the callback.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_REGISTER_CX_CALLBACK,
+
+  /*!\brief Codec control function to set color space info.
+   * \note Valid ranges: 0..7, default is "UNKNOWN".
+   *                     0 = UNKNOWN,
+   *                     1 = BT_601
+   *                     2 = BT_709
+   *                     3 = SMPTE_170
+   *                     4 = SMPTE_240
+   *                     5 = BT_2020
+   *                     6 = RESERVED
+   *                     7 = SRGB
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_COLOR_SPACE,
+
+  /*!\brief Codec control function to set temporal layering mode.
+   * \note Valid ranges: 0..3, default is "0" (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING).
+   *                     0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING
+   *                     1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS
+   *                     2 = VP9E_TEMPORAL_LAYERING_MODE_0101
+   *                     3 = VP9E_TEMPORAL_LAYERING_MODE_0212
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TEMPORAL_LAYERING_MODE,
+
+  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+   *
+   * By default the value is set as 4.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MIN_GF_INTERVAL,
+
+  /*!\brief Codec control function to set maximum interval between GF/ARF frames
+   *
+   * By default the value is set as 16.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MAX_GF_INTERVAL,
+
+  /*!\brief Codec control function to get an Active map back from the encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_ACTIVEMAP,
 };
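/*
 * Illustrative sketch (not part of the patch): these controls are issued
 * through vpx_codec_control() after the encoder has been initialized.  The
 * values below are arbitrary examples.
 */
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static void apply_example_controls(vpx_codec_ctx_t *codec) {
  vpx_codec_control(codec, VP8E_SET_CPUUSED, 4);          /* speed/quality trade-off */
  vpx_codec_control(codec, VP9E_SET_TILE_COLUMNS, 2);     /* 2**2 = 4 tile columns */
  vpx_codec_control(codec, VP9E_SET_AQ_MODE, 0);          /* adaptive quantization off */
  vpx_codec_control(codec, VP9E_SET_MAX_GF_INTERVAL, 16); /* documented default above */
}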
 
 /*!\brief vpx 1-D scaling mode
@@ -220,6 +560,32 @@
   VP8E_ONETWO      = 3
 } VPX_SCALING_MODE;
 
+/*!\brief Temporal layering mode enum for VP9 SVC.
+ *
+ * This set of macros define the different temporal layering modes.
+ * Supported codecs: VP9 (in SVC mode)
+ *
+ */
+typedef enum vp9e_temporal_layering_mode {
+  /*!\brief No temporal layering.
+   * Used when only spatial layering is used.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING   = 0,
+
+  /*!\brief Bypass mode.
+   * Used when application needs to control temporal layering.
+   * This will only work when the number of spatial layers equals 1.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_BYPASS       = 1,
+
+  /*!\brief 0-1-0-1... temporal layering scheme with two temporal layers.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_0101         = 2,
+
+  /*!\brief 0-2-1-2... temporal layering scheme with three temporal layers.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_0212         = 3
+} VP9E_TEMPORAL_LAYERING_MODE;
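/*
 * Illustrative sketch (not part of the patch): a two-temporal-layer CBR
 * configuration using the 0-1-0-1 scheme above.  It assumes the encoder
 * configuration exposes the ts_* fields and a temporal_layering_mode member;
 * bitrate numbers are arbitrary, and ts_target_bitrate is cumulative.
 */
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static void configure_two_temporal_layers(vpx_codec_enc_cfg_t *cfg) {
  cfg->rc_end_usage = VPX_CBR;
  cfg->rc_target_bitrate = 600;    /* kbps */
  cfg->ts_number_layers = 2;
  cfg->ts_periodicity = 2;
  cfg->ts_layer_id[0] = 0;         /* even frames -> base layer */
  cfg->ts_layer_id[1] = 1;         /* odd frames  -> enhancement layer */
  cfg->ts_rate_decimator[0] = 2;   /* base layer runs at half frame rate */
  cfg->ts_rate_decimator[1] = 1;
  cfg->ts_target_bitrate[0] = 360; /* kbps, base layer alone */
  cfg->ts_target_bitrate[1] = 600; /* kbps, both layers together */
  cfg->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0101;  /* assumed field */
}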
 
 /*!\brief  vpx region of interest map
  *
@@ -295,24 +661,6 @@
   VP8_TUNE_SSIM
 } vp8e_tuning;
 
-/*!\brief  vp9 svc parameters
- *
- * This defines parameters for svc encoding.
- *
- */
-typedef struct vpx_svc_parameters {
-  unsigned int width;         /**< width of current spatial layer */
-  unsigned int height;        /**< height of current spatial layer */
-  int spatial_layer;          /**< current spatial layer number - 0 = base */
-  int temporal_layer;         /**< current temporal layer number - 0 = base */
-  int max_quantizer;          /**< max quantizer for current layer */
-  int min_quantizer;          /**< min quantizer for current layer */
-  int distance_from_i_frame;  /**< frame number within current gop */
-  int lst_fb_idx;             /**< last frame frame buffer index */
-  int gld_fb_idx;             /**< golden frame frame buffer index */
-  int alt_fb_idx;             /**< alt reference frame frame buffer index */
-} vpx_svc_parameters_t;
-
 /*!\brief  vp9 svc layer parameters
  *
  * This defines the spatial and temporal layer id numbers for svc encoding.
@@ -340,12 +688,15 @@
 VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_REFERENCE,          int)
 VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE,          int)
 
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS,        int)
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID,  int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP,            vpx_roi_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP,          vpx_active_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE,          vpx_scaling_mode_t *)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC,                int)
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS,     vpx_svc_parameters_t *)
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS,     void *)
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK,   void *)
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID,       vpx_svc_layer_id_t *)
 
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED,            int)
@@ -366,8 +717,14 @@
 
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID,  vpx_svc_layer_id_t *)
 
 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 
@@ -377,7 +734,29 @@
 
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 
+VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY,  unsigned int)
+
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL,  unsigned int)
+
+/*!\brief
+ *
+ * TODO(debargha) : add support of the control in ffmpeg
+ */
+#define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
+
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL,  unsigned int)
+/*!\brief
+ *
+ * TODO(debargha) : add support of the control in ffmpeg
+ */
+#define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h
index bd7f19c..27b9f78 100644
--- a/libvpx/vpx/vp8dx.h
+++ b/libvpx/vpx/vp8dx.h
@@ -9,13 +9,13 @@
  */
 
 
-/*!\defgroup vp8_decoder WebM VP8 Decoder
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
  * \ingroup vp8
  *
  * @{
  */
 /*!\file
- * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
  *        interface.
  */
 #ifndef VPX_VP8DX_H_
@@ -30,18 +30,30 @@
 
 /*!\name Algorithm interface for VP8
  *
- * This interface provides the capability to decode raw VP8 streams, as would
- * be found in AVI files and other non-Flash uses.
+ * This interface provides the capability to decode VP8 streams.
  * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
 
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
 extern vpx_codec_iface_t  vpx_codec_vp9_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
 
+/*!\name Algorithm interface for VP10
+ *
+ * This interface provides the capability to decode VP10 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp10_dx_algo;
+extern vpx_codec_iface_t *vpx_codec_vp10_dx(void);
+/*!@} - end algorithm interface member group*/
 
 /*!\enum vp8_dec_control_id
  * \brief VP8 decoder control functions
@@ -72,12 +84,43 @@
   VPXD_SET_DECRYPTOR,
   VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR,
 
-  /** control function to get the display dimensions for the current frame. */
+  /** control function to get the dimensions that the current frame is decoded
+   * at. This may be different from the intended display size for the frame as
+   * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */
+  VP9D_GET_FRAME_SIZE,
+
+  /** control function to get the current frame's intended display dimensions
+   * (as specified in the wrapper or frame header). This may be different from
+   * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */
   VP9D_GET_DISPLAY_SIZE,
 
-  /** For testing. */
+  /** control function to get the bit depth of the stream. */
+  VP9D_GET_BIT_DEPTH,
+
+  /** control function to set the byte alignment of the planes in the reference
+   * buffers. Valid values are powers of 2, from 32 to 1024. A value of 0 sets
+   * legacy alignment: the Y plane is aligned to 32 bytes, the U plane directly
+   * follows the Y plane, and the V plane follows the U plane. Default is 0.
+   */
+  VP9_SET_BYTE_ALIGNMENT,
+
+  /** control function to invert the decoding order to be from right to left.
+   * The function is used in a test to confirm the decoding independence of
+   * tile columns. It may also be used in applications where this decoding
+   * order is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   *              release, this test-only control shall be removed.
+   */
   VP9_INVERT_TILE_DECODE_ORDER,
 
+  /** control function to set the skip loop filter flag. Valid values are
+   * integers. The decoder will skip the loop filter when the value is nonzero.
+   * If the loop filter is skipped, the decoder may accumulate decode
+   * artifacts. The default value is 0.
+   */
+  VP9_SET_SKIP_LOOP_FILTER,
+
   VP8_DECODER_CTRL_ID_MAX
 };
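/*
 * Illustrative sketch (not part of the patch): querying the new decoder
 * controls after a frame has been decoded.  Error checking is omitted.
 */
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static void report_stream_info(vpx_codec_ctx_t *decoder) {
  int frame_size[2];    /* decoded width/height */
  int display_size[2];  /* intended display width/height */
  unsigned int bit_depth;

  vpx_codec_control(decoder, VP9D_GET_FRAME_SIZE, frame_size);
  vpx_codec_control(decoder, VP9D_GET_DISPLAY_SIZE, display_size);
  vpx_codec_control(decoder, VP9D_GET_BIT_DEPTH, &bit_depth);
  vpx_codec_control(decoder, VP9_SET_SKIP_LOOP_FILTER, 1);  /* trade quality for speed */
}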
 
@@ -118,6 +161,8 @@
 VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR,           vpx_decrypt_init *)
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR,           vpx_decrypt_init *)
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE,        int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH,           unsigned int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE,          int *)
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 
 /*! @} - end defgroup vp8_decoder */
diff --git a/libvpx/vpx/vpx_codec.h b/libvpx/vpx/vpx_codec.h
index 07df72a..b6037bb 100644
--- a/libvpx/vpx/vpx_codec.h
+++ b/libvpx/vpx/vpx_codec.h
@@ -69,7 +69,7 @@
 
   /*!\brief Decorator indicating a function is potentially unused */
 #ifdef UNUSED
-#elif __GNUC__
+#elif defined(__GNUC__) || defined(__clang__)
 #define UNUSED __attribute__ ((unused))
 #else
 #define UNUSED
@@ -83,7 +83,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_CODEC_ABI_VERSION (2 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
 
   /*!\brief Algorithm return codes */
   typedef enum {
@@ -203,9 +203,11 @@
     const char              *err_detail;  /**< Detailed info, if available */
     vpx_codec_flags_t        init_flags;  /**< Flags passed at init time */
     union {
-      struct vpx_codec_dec_cfg  *dec;   /**< Decoder Configuration Pointer */
-      struct vpx_codec_enc_cfg  *enc;   /**< Encoder Configuration Pointer */
-      void                      *raw;
+      /**< Decoder Configuration Pointer */
+      const struct vpx_codec_dec_cfg *dec;
+      /**< Encoder Configuration Pointer */
+      const struct vpx_codec_enc_cfg *enc;
+      const void                     *raw;
     }                        config;      /**< Configuration pointer aliasing union */
     vpx_codec_priv_t        *priv;        /**< Algorithm private storage */
   } vpx_codec_ctx_t;
@@ -215,9 +217,9 @@
    * This enumeration determines the bit depth of the codec.
    */
   typedef enum vpx_bit_depth {
-    VPX_BITS_8,   /**< 8 bits  */
-    VPX_BITS_10,  /**< 10 bits */
-    VPX_BITS_12   /**< 12 bits */
+    VPX_BITS_8  =  8,  /**<  8 bits */
+    VPX_BITS_10 = 10,  /**< 10 bits */
+    VPX_BITS_12 = 12,  /**< 12 bits */
   } vpx_bit_depth_t;
 
   /*
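Because the vpx_bit_depth values above now carry the literal bit counts, they can
be used directly in arithmetic. A small illustrative helper (the function name is
ours, not part of the API):

#include "vpx/vpx_codec.h"

/* Largest representable sample value for a given bit depth:
 * 255 for VPX_BITS_8, 1023 for VPX_BITS_10, 4095 for VPX_BITS_12. */
static int max_sample_value(vpx_bit_depth_t bit_depth) {
  return (1 << (int)bit_depth) - 1;
}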
diff --git a/libvpx/vpx/vpx_codec.mk b/libvpx/vpx/vpx_codec.mk
index a1ad3c5..ccdef04 100644
--- a/libvpx/vpx/vpx_codec.mk
+++ b/libvpx/vpx/vpx_codec.mk
@@ -31,17 +31,17 @@
 API_DOC_SRCS-yes += vpx_frame_buffer.h
 API_DOC_SRCS-yes += vpx_image.h
 
-API_SRCS-yes                += src/vpx_decoder.c
-API_SRCS-yes                += vpx_decoder.h
-API_SRCS-yes                += src/vpx_encoder.c
-API_SRCS-yes                += vpx_encoder.h
-API_SRCS-yes                += internal/vpx_codec_internal.h
-API_SRCS-yes                += internal/vpx_psnr.h
-API_SRCS-yes                += src/vpx_codec.c
-API_SRCS-yes                += src/vpx_image.c
-API_SRCS-yes                += src/vpx_psnr.c
-API_SRCS-yes                += vpx_codec.h
-API_SRCS-yes                += vpx_codec.mk
-API_SRCS-yes                += vpx_frame_buffer.h
-API_SRCS-yes                += vpx_image.h
-API_SRCS-$(BUILD_LIBVPX)    += vpx_integer.h
+API_SRCS-yes += src/vpx_decoder.c
+API_SRCS-yes += vpx_decoder.h
+API_SRCS-yes += src/vpx_encoder.c
+API_SRCS-yes += vpx_encoder.h
+API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_psnr.h
+API_SRCS-yes += src/vpx_codec.c
+API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_psnr.c
+API_SRCS-yes += vpx_codec.h
+API_SRCS-yes += vpx_codec.mk
+API_SRCS-yes += vpx_frame_buffer.h
+API_SRCS-yes += vpx_image.h
+API_SRCS-yes += vpx_integer.h
diff --git a/libvpx/vpx/vpx_decoder.h b/libvpx/vpx/vpx_decoder.h
index 10b89fa..62fd919 100644
--- a/libvpx/vpx/vpx_decoder.h
+++ b/libvpx/vpx/vpx_decoder.h
@@ -135,7 +135,7 @@
    */
   vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_dec_cfg_t  *cfg,
+                                         const vpx_codec_dec_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
 
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index 7dbbf2f..2b17f98 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -42,8 +42,11 @@
   /*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */
 #define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY
 
-  /*!\deprecated Use #VPX_TS_MAX_LAYERS instead. */
-#define MAX_LAYERS      VPX_TS_MAX_LAYERS
+/*! Temporal+Spatial Scalability: Maximum number of coding layers */
+#define VPX_MAX_LAYERS  12  // 3 temporal + 4 spatial layers are allowed.
+
+/*!\deprecated Use #VPX_MAX_LAYERS instead. */
+#define MAX_LAYERS    VPX_MAX_LAYERS  // 3 temporal + 4 spatial layers allowed.
 
 /*! Spatial Scalability: Maximum number of coding layers */
 #define VPX_SS_MAX_LAYERS       5
@@ -59,7 +62,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_ENCODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
 
 
   /*! \brief Encoder capabilities bitfield
@@ -80,6 +83,9 @@
    */
 #define VPX_CODEC_CAP_OUTPUT_PARTITION  0x20000
 
+/*! Can support input images at greater than 8-bit depth.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH  0x40000
 
   /*! \brief Initialization-time Feature Enabling
    *
@@ -91,6 +97,7 @@
 #define VPX_CODEC_USE_PSNR  0x10000 /**< Calculate PSNR on each frame */
 #define VPX_CODEC_USE_OUTPUT_PARTITION  0x20000 /**< Make the encoder output one
   partition at a time. */
+#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
 
 
   /*!\brief Generic fixed size buffer structure
@@ -157,8 +164,11 @@
     VPX_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
     VPX_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
     VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
-#if CONFIG_SPATIAL_SVC
+    // Spatial SVC is still experimental and may be removed before the next ABI
+    // bump.
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
     VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
+    VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
 #endif
     VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
   };
@@ -188,16 +198,19 @@
                                               has id 0.*/
 
       } frame;  /**< data for compressed frame packet */
-      struct vpx_fixed_buf twopass_stats;  /**< data for two-pass packet */
-      struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */
+      vpx_fixed_buf_t twopass_stats;  /**< data for two-pass packet */
+      vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
       struct vpx_psnr_pkt {
         unsigned int samples[4];  /**< Number of samples, total/y/u/v */
         uint64_t     sse[4];      /**< sum squared error, total/y/u/v */
         double       psnr[4];     /**< PSNR, total/y/u/v */
       } psnr;                       /**< data for PSNR packet */
-      struct vpx_fixed_buf raw;     /**< data for arbitrary packets */
-#if CONFIG_SPATIAL_SVC
+      vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
+      // Spatial SVC is still experimental and may be removed before the next
+      // ABI bump.
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
       size_t layer_sizes[VPX_SS_MAX_LAYERS];
+      struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
 #endif
 
       /* This packet size is fixed to allow codecs to extend this
@@ -210,6 +223,22 @@
   } vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */
 
 
+  /*!\brief Encoder return output buffer callback
+   *
+   * This callback function, when registered, is invoked with the output
+   * packet as each spatial layer is encoded.
+   */
+  // putting the definitions here for now. (agrange: find if there
+  // is a better place for this)
+  typedef void (* vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
+                                                       void *user_data);
+
+  /*!\brief Callback function pointer / user data pair storage */
+  typedef struct vpx_codec_enc_output_cx_cb_pair {
+    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+    void                            *user_priv; /**< Pointer to private data */
+  } vpx_codec_priv_output_cx_pkt_cb_pair_t;
+
   /*!\brief Rational Number
    *
    * This structure holds a fractional value.
@@ -324,6 +353,21 @@
      */
     unsigned int           g_h;
 
+    /*!\brief Bit-depth of the codec
+     *
+     * This value identifies the bit_depth of the codec.
+     * Only certain bit-depths are supported as identified in the
+     * vpx_bit_depth_t enum.
+     */
+    vpx_bit_depth_t        g_bit_depth;
+
+    /*!\brief Bit-depth of the input frames
+     *
+     * This value identifies the bit_depth of the input frames in bits.
+     * Note that the frames passed as input to the encoder must have
+     * this bit-depth.
+     */
+    unsigned int           g_input_bit_depth;
 
     /*!\brief Stream timebase units
      *
@@ -452,14 +496,14 @@
      * A buffer containing all of the stats packets produced in the first
      * pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_twopass_stats_in;
+    vpx_fixed_buf_t   rc_twopass_stats_in;
 
     /*!\brief first pass mb stats buffer.
      *
      * A buffer containing all of the first pass mb stats packets produced
      * in the first pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_firstpass_mb_stats_in;
+    vpx_fixed_buf_t   rc_firstpass_mb_stats_in;
 
     /*!\brief Target data rate
      *
@@ -688,8 +732,37 @@
      * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1).
     */
     unsigned int           ts_layer_id[VPX_TS_MAX_PERIODICITY];
+
+    /*!\brief Target bitrate for each spatial/temporal layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * spatial/temporal layer.
+     *
+     */
+    unsigned int           layer_target_bitrate[VPX_MAX_LAYERS];
+
+    /*!\brief Temporal layering mode indicating which temporal layering scheme to use.
+     *
+     * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the
+     * temporal layering mode to use.
+     *
+     */
+    int                    temporal_layering_mode;
   } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
 
+  /*!\brief  vp9 svc extra configuration parameters
+   *
+   * This defines max/min quantizers and scale factors for each layer
+   *
+   */
+  typedef struct vpx_svc_parameters {
+    int max_quantizers[VPX_MAX_LAYERS]; /**< Max Q for each layer */
+    int min_quantizers[VPX_MAX_LAYERS]; /**< Min Q for each layer */
+    int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */
+    int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
+    int temporal_layering_mode; /**< Temporal layering mode */
+  } vpx_svc_extra_cfg_t;
+
 
   /*!\brief Initialize an encoder instance
    *
@@ -715,7 +788,7 @@
    */
   vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_enc_cfg_t  *cfg,
+                                         const vpx_codec_enc_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
 
@@ -774,9 +847,9 @@
    * be called by all applications to initialize the configuration structure
    * before specializing the configuration with application specific values.
    *
-   * \param[in]    iface   Pointer to the algorithm interface to use.
-   * \param[out]   cfg     Configuration buffer to populate
-   * \param[in]    usage   End usage. Set to 0 or use codec specific values.
+   * \param[in]    iface     Pointer to the algorithm interface to use.
+   * \param[out]   cfg       Configuration buffer to populate.
+   * \param[in]    reserved  Must be set to 0 for VP8 and VP9.
    *
    * \retval #VPX_CODEC_OK
    *     The configuration was populated.
@@ -787,7 +860,7 @@
    */
   vpx_codec_err_t  vpx_codec_enc_config_default(vpx_codec_iface_t    *iface,
                                                 vpx_codec_enc_cfg_t  *cfg,
-                                                unsigned int          usage);
+                                                unsigned int          reserved);
 
 
   /*!\brief Set or change configuration
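A minimal sketch of how the new high-bit-depth and per-layer fields above might be
populated, assuming a libvpx build whose VP9 encoder advertises
VPX_CODEC_CAP_HIGHBITDEPTH; vpx_codec_vp9_cx() comes from vp8cx.h, and the profile
choice is an assumption about VP9 profiles rather than something stated in this
header.

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Sketch: default config, then opt in to 10-bit input and a single-layer
 * rate target. Error handling is reduced to returning the first failure. */
static vpx_codec_err_t init_10bit_vp9_encoder(vpx_codec_ctx_t *encoder,
                                              unsigned int width,
                                              unsigned int height) {
  vpx_codec_iface_t *iface = vpx_codec_vp9_cx();
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_err_t res = vpx_codec_enc_config_default(iface, &cfg, 0);
  if (res != VPX_CODEC_OK) return res;

  cfg.g_w = width;
  cfg.g_h = height;
  cfg.g_profile = 2;               /* assumed: 10/12-bit VP9 needs profile 2/3 */
  cfg.g_bit_depth = VPX_BITS_10;   /* codec bit depth */
  cfg.g_input_bit_depth = 10;      /* input frames must match this depth */
  cfg.rc_target_bitrate = 1000;    /* kbit/s */
  cfg.layer_target_bitrate[0] = cfg.rc_target_bitrate;  /* single layer */

  return vpx_codec_enc_init(encoder, iface, &cfg, VPX_CODEC_USE_HIGHBITDEPTH);
}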
diff --git a/libvpx/vpx/vpx_frame_buffer.h b/libvpx/vpx/vpx_frame_buffer.h
index e69df4b..9036459 100644
--- a/libvpx/vpx/vpx_frame_buffer.h
+++ b/libvpx/vpx/vpx_frame_buffer.h
@@ -22,8 +22,11 @@
 #include "./vpx_integer.h"
 
 /*!\brief The maximum number of work buffers used by libvpx.
+ *  Supports a maximum of 4 threads decoding video in parallel.
+ *  Each thread uses one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
  */
-#define VPX_MAXIMUM_WORK_BUFFERS 1
+#define VPX_MAXIMUM_WORK_BUFFERS 8
 
 /*!\brief The maximum number of reference buffers that a VP9 encoder may use.
  */
@@ -43,15 +46,15 @@
  *
  * This callback is invoked by the decoder to retrieve data for the frame
  * buffer in order for the decode call to complete. The callback must
- * allocate at least min_size in bytes and assign it to fb->data. Then the
- * callback must set fb->size to the allocated size. The application does not
- * need to align the allocated data. The callback is triggered when the
- * decoder needs a frame buffer to decode a compressed image into. This
- * function may be called more than once for every call to vpx_codec_decode.
- * The application may set fb->priv to some data which will be passed
- * back in the ximage and the release function call. |fb| is guaranteed to
- * not be NULL. On success the callback must return 0. Any failure the
- * callback must return a value less than 0.
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to vpx_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the ximage and the release function
+ * call. |fb| is guaranteed to not be NULL. On success the callback must
+ * return 0. On any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] new_size     Size in bytes needed by the buffer
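A minimal sketch of an external frame-buffer allocator following the rules above;
the callback signatures and the data/size/priv members of vpx_codec_frame_buffer_t
are assumed to match the declarations elsewhere in this header.

#include <stdlib.h>
#include "vpx/vpx_frame_buffer.h"

/* Allocate at least |min_size| zeroed bytes and describe them in |fb|.
 * Returns 0 on success, a negative value on failure, as required above. */
static int example_get_frame_buffer(void *priv, size_t min_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(1, min_size);  /* calloc zeroes the buffer */
  if (fb->data == NULL) return -1;
  fb->size = min_size;
  fb->priv = NULL;
  return 0;
}

/* Matching release callback: free the buffer handed out above. */
static int example_release_frame_buffer(void *priv,
                                        vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->data);
  fb->data = NULL;
  fb->size = 0;
  return 0;
}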
diff --git a/libvpx/vpx/vpx_image.h b/libvpx/vpx/vpx_image.h
index 7b04b70..c06d351 100644
--- a/libvpx/vpx/vpx_image.h
+++ b/libvpx/vpx/vpx_image.h
@@ -28,13 +28,13 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
 
 
-#define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
-#define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U plane in memory */
-#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel component */
-#define VPX_IMG_FMT_HIGH       0x800  /**< Image uses 16bit framebuffer */
+#define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800  /**< Image uses 16bit framebuffer. */
 
   /*!\brief List of supported image formats */
   typedef enum vpx_img_fmt {
@@ -58,49 +58,30 @@
     VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
     VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
     VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
-    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
-    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH,
-    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH,
-    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH
+    VPX_IMG_FMT_I440    = VPX_IMG_FMT_PLANAR | 7,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6,
+    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I44016    = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
-#define IMG_FMT_UV_FLIP        VPX_IMG_FMT_UV_FLIP    /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
-#define IMG_FMT_HAS_ALPHA      VPX_IMG_FMT_HAS_ALPHA  /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */
-
-  /*!\brief Deprecated list of supported image formats
-   * \deprecated New code should use #vpx_img_fmt
-   */
-#define img_fmt   vpx_img_fmt
-  /*!\brief alias for enum img_fmt.
-   * \deprecated New code should use #vpx_img_fmt_t
-   */
-#define img_fmt_t vpx_img_fmt_t
-
-#define IMG_FMT_NONE       VPX_IMG_FMT_NONE       /**< \deprecated Use #VPX_IMG_FMT_NONE */
-#define IMG_FMT_RGB24      VPX_IMG_FMT_RGB24      /**< \deprecated Use #VPX_IMG_FMT_RGB24 */
-#define IMG_FMT_RGB32      VPX_IMG_FMT_RGB32      /**< \deprecated Use #VPX_IMG_FMT_RGB32 */
-#define IMG_FMT_RGB565     VPX_IMG_FMT_RGB565     /**< \deprecated Use #VPX_IMG_FMT_RGB565 */
-#define IMG_FMT_RGB555     VPX_IMG_FMT_RGB555     /**< \deprecated Use #VPX_IMG_FMT_RGB555 */
-#define IMG_FMT_UYVY       VPX_IMG_FMT_UYVY       /**< \deprecated Use #VPX_IMG_FMT_UYVY */
-#define IMG_FMT_YUY2       VPX_IMG_FMT_YUY2       /**< \deprecated Use #VPX_IMG_FMT_YUY2 */
-#define IMG_FMT_YVYU       VPX_IMG_FMT_YVYU       /**< \deprecated Use #VPX_IMG_FMT_YVYU */
-#define IMG_FMT_BGR24      VPX_IMG_FMT_BGR24      /**< \deprecated Use #VPX_IMG_FMT_BGR24 */
-#define IMG_FMT_RGB32_LE   VPX_IMG_FMT_RGB32_LE   /**< \deprecated Use #VPX_IMG_FMT_RGB32_LE */
-#define IMG_FMT_ARGB       VPX_IMG_FMT_ARGB       /**< \deprecated Use #VPX_IMG_FMT_ARGB */
-#define IMG_FMT_ARGB_LE    VPX_IMG_FMT_ARGB_LE    /**< \deprecated Use #VPX_IMG_FMT_ARGB_LE */
-#define IMG_FMT_RGB565_LE  VPX_IMG_FMT_RGB565_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB565_LE */
-#define IMG_FMT_RGB555_LE  VPX_IMG_FMT_RGB555_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB555_LE */
-#define IMG_FMT_YV12       VPX_IMG_FMT_YV12       /**< \deprecated Use #VPX_IMG_FMT_YV12 */
-#define IMG_FMT_I420       VPX_IMG_FMT_I420       /**< \deprecated Use #VPX_IMG_FMT_I420 */
-#define IMG_FMT_VPXYV12    VPX_IMG_FMT_VPXYV12    /**< \deprecated Use #VPX_IMG_FMT_VPXYV12 */
-#define IMG_FMT_VPXI420    VPX_IMG_FMT_VPXI420    /**< \deprecated Use #VPX_IMG_FMT_VPXI420 */
-#endif /* VPX_CODEC_DISABLE_COMPAT */
+  /*!\brief List of supported color spaces */
+  typedef enum vpx_color_space {
+    VPX_CS_UNKNOWN    = 0,  /**< Unknown */
+    VPX_CS_BT_601     = 1,  /**< BT.601 */
+    VPX_CS_BT_709     = 2,  /**< BT.709 */
+    VPX_CS_SMPTE_170  = 3,  /**< SMPTE.170 */
+    VPX_CS_SMPTE_240  = 4,  /**< SMPTE.240 */
+    VPX_CS_BT_2020    = 5,  /**< BT.2020 */
+    VPX_CS_RESERVED   = 6,  /**< Reserved */
+    VPX_CS_SRGB       = 7   /**< sRGB */
+  } vpx_color_space_t; /**< alias for enum vpx_color_space */
 
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
+    vpx_color_space_t cs; /**< Color Space */
 
     /* Image storage dimensions */
     unsigned int  w;           /**< Stored image width */
@@ -121,13 +102,6 @@
 #define VPX_PLANE_U      1   /**< U (Chroma) plane */
 #define VPX_PLANE_V      2   /**< V (Chroma) plane */
 #define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define PLANE_PACKED     VPX_PLANE_PACKED
-#define PLANE_Y          VPX_PLANE_Y
-#define PLANE_U          VPX_PLANE_U
-#define PLANE_V          VPX_PLANE_V
-#define PLANE_ALPHA      VPX_PLANE_ALPHA
-#endif
     unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
     int      stride[4];  /**< stride between rows for each plane */
 
diff --git a/libvpx/vpx/vpx_integer.h b/libvpx/vpx/vpx_integer.h
index ffeefb8..829c9d1 100644
--- a/libvpx/vpx/vpx_integer.h
+++ b/libvpx/vpx/vpx_integer.h
@@ -37,6 +37,8 @@
 typedef signed __int64   int64_t;
 typedef unsigned __int64 uint64_t;
 #define INT64_MAX _I64_MAX
+#define INT32_MAX _I32_MAX
+#define INT32_MIN _I32_MIN
 #define INT16_MAX _I16_MAX
 #define INT16_MIN _I16_MIN
 #endif
@@ -49,9 +51,15 @@
 
 /* Most platforms have the C99 standard integer types. */
 
-#if defined(__cplusplus) && !defined(__STDC_FORMAT_MACROS)
-#define __STDC_FORMAT_MACROS
-#endif
+#if defined(__cplusplus)
+# if !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS
+# endif
+# if !defined(__STDC_LIMIT_MACROS)
+#  define __STDC_LIMIT_MACROS
+# endif
+#endif  // __cplusplus
+
 #include <stdint.h>
 
 #endif
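The reason for defining the limit macros before <stdint.h> is that pre-C++11
toolchains only expose INT32_MAX/INT32_MIN when __STDC_LIMIT_MACROS is set at the
point of the first include, which is what the block above arranges. A small sketch
of code that relies on it:

#include "vpx/vpx_integer.h"

/* Saturate a 64-bit value to the int32_t range; INT32_MAX/INT32_MIN are
 * visible because vpx_integer.h defines the limit macros before pulling
 * in <stdint.h>. */
static int32_t saturate_to_int32(int64_t v) {
  if (v > INT32_MAX) return INT32_MAX;
  if (v < INT32_MIN) return INT32_MIN;
  return (int32_t)v;
}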
diff --git a/libvpx/vpx_dsp/arm/bilinear_filter_media.asm b/libvpx/vpx_dsp/arm/bilinear_filter_media.asm
new file mode 100644
index 0000000..f3f9754
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_filter_block2d_bil_first_pass_media|
+    EXPORT  |vpx_filter_block2d_bil_second_pass_media|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vpx_filter_block2d_bil_second_pass_media|
+
+    END
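For readers not fluent in ARM media instructions, a heavily simplified scalar
sketch of the structure the two passes above implement: the first pass applies a
horizontal 2-tap filter and stores a 16-bit intermediate transposed, the second
pass filters vertically over that transposed buffer and clamps to 8 bits. The
coefficient-128 short-circuit and the exact intermediate buffer layout of the
assembly are not reproduced.

#include <stdint.h>

#define BIL_ROUND(x) (((x) + 64) >> 7)  /* the two filter taps sum to 128 */

/* First pass: horizontal 2-tap filter; the result is stored transposed so
 * the second pass can walk it with the same inner loop. |ext_height| is the
 * extended height (output height + 1). */
static void bil_first_pass_c(const uint8_t *src, uint16_t *mid,
                             unsigned int src_pitch, unsigned int ext_height,
                             unsigned int width, const int16_t *filter) {
  unsigned int r, c;
  for (r = 0; r < ext_height; ++r) {
    for (c = 0; c < width; ++c) {
      const int sum = src[c] * filter[0] + src[c + 1] * filter[1];
      mid[c * ext_height + r] = (uint16_t)BIL_ROUND(sum);  /* transposed store */
    }
    src += src_pitch;
  }
}

/* Second pass: vertical 2-tap filter over the transposed intermediate,
 * clamped to 8 bits and written back in normal raster order. */
static void bil_second_pass_c(const uint16_t *mid, uint8_t *dst, int dst_pitch,
                              unsigned int out_height, unsigned int width,
                              const int16_t *filter) {
  unsigned int r, c;
  for (c = 0; c < width; ++c) {
    for (r = 0; r < out_height; ++r) {
      int sum = BIL_ROUND(mid[r] * filter[0] + mid[r + 1] * filter[1]);
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      dst[r * dst_pitch + c] = (uint8_t)sum;
    }
    mid += out_height + 1;  /* next transposed column (ext_height entries) */
  }
}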
diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000..9f9de98
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;
+  // stage 1
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step);
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step);
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+                                            vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+                                            vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+                                            vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+                                            vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);
+  for (r = 1; r < 8; ++r) {
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+  {
+    const int32x4_t a = vpaddlq_s16(sum);
+    const int64x2_t b = vpaddlq_s32(a);
+    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                                 vreinterpret_s32_s64(vget_high_s64(b)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+    output[1] = 0;
+  }
+}
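The post-conditioning block in vpx_fdct8x8_neon above relies on the identity
n / 2 == (n - (n >> 15)) >> 1 for 16-bit signed values (truncating division by
two, which the vhsubq_s16-with-sign-vector sequence reproduces). A scalar check of
that identity, assuming arithmetic right shifts as on the targeted compilers:

#include <assert.h>
#include <stdint.h>

/* (n >> 15) is -1 for negative n and 0 otherwise, so subtracting it before
 * the final shift turns floor division into truncation toward zero. */
static int16_t halve_toward_zero(int16_t n) {
  return (int16_t)((n - (n >> 15)) >> 1);
}

static void check_halving_identity(void) {
  int v;
  for (v = -32768; v <= 32767; ++v)
    assert(halve_toward_zero((int16_t)v) == (int16_t)(v / 2));
}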
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
similarity index 97%
rename from libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
index b1fd21b..dc459e2 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_idct16x16_1_add_neon|
+    EXPORT  |vpx_idct16x16_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                    int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct16x16_1_add_neon| PROC
+|vpx_idct16x16_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -193,6 +193,6 @@
     vst1.64          {d31}, [r12], r2
 
     bx               lr
-    ENDP             ; |vp9_idct16x16_1_add_neon|
+    ENDP             ; |vpx_idct16x16_1_add_neon|
 
     END
diff --git a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 0000000..f734e48
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct16x16_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint64x1_t d2u64, d3u64, d4u64, d5u64;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, j, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
+    q0s16 = vdupq_n_s16(a1);
+    q0u16 = vreinterpretq_u16_s16(q0s16);
+
+    for (d1 = d2 = dest, i = 0; i < 4; i++) {
+        for (j = 0; j < 2; j++) {
+            d2u64 = vld1_u64((const uint64_t *)d1);
+            d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+            d4u64 = vld1_u64((const uint64_t *)d1);
+            d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+
+            q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+            q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+            q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+            q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+            d2 += dest_stride;
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+            d2 += dest_stride;
+        }
+    }
+    return;
+}
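A scalar sketch of the arithmetic the NEON routine above performs: the single DC
coefficient is passed twice through the add-8192-shift-by-14 rounding used by
dct_const_round_shift, rounded by 6 more bits, and the resulting constant is added
with saturation to every pixel of the 16x16 block. Constant names mirror the NEON
code; the macro is a local restatement, not the library's definition.

#include <stdint.h>

#define DC_ROUND_SHIFT(x) (((x) + (1 << 13)) >> 14)  /* dct_const_round_shift */

static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  const int16_t cospi_16_64 = 11585;
  int16_t out = (int16_t)DC_ROUND_SHIFT(input[0] * cospi_16_64);
  int a1, r, c;
  out = (int16_t)DC_ROUND_SHIFT(out * cospi_16_64);
  a1 = (out + 32) >> 6;  /* ROUND_POWER_OF_TWO(out, 6) */

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int v = dest[c] + a1;                             /* add DC term */
      dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   /* saturate */
    }
    dest += dest_stride;
  }
}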
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm b/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
similarity index 97%
rename from libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
index a13c0d0..22a0c95 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -8,10 +8,10 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_idct16x16_256_add_neon_pass1|
-    EXPORT  |vp9_idct16x16_256_add_neon_pass2|
-    EXPORT  |vp9_idct16x16_10_add_neon_pass1|
-    EXPORT  |vp9_idct16x16_10_add_neon_pass2|
+    EXPORT  |vpx_idct16x16_256_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_256_add_neon_pass2|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass2|
     ARM
     REQUIRE8
     PRESERVE8
@@ -36,7 +36,7 @@
     MEND
 
     AREA    Block, CODE, READONLY ; name this block of code
-;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
+;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
 ;                                          int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -46,7 +46,7 @@
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass1| PROC
+|vpx_idct16x16_256_add_neon_pass1| PROC
 
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -273,9 +273,9 @@
     vst1.64         {d31}, [r1], r2
 
     bx              lr
-    ENDP  ; |vp9_idct16x16_256_add_neon_pass1|
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
 
-;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
+;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
 ;                                        int16_t *output,
 ;                                        int16_t *pass1Output,
 ;                                        int16_t skip_adding,
@@ -292,7 +292,7 @@
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass2| PROC
+|vpx_idct16x16_256_add_neon_pass2| PROC
     push            {r3-r9}
 
     ; TODO(hkuang): Find a better way to load the elements.
@@ -784,9 +784,9 @@
 end_idct16x16_pass2
     pop             {r3-r9}
     bx              lr
-    ENDP  ; |vp9_idct16x16_256_add_neon_pass2|
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
 
-;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
+;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
 ;                                             int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -796,7 +796,7 @@
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass1| PROC
+|vpx_idct16x16_10_add_neon_pass1| PROC
 
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -905,9 +905,9 @@
     vst1.64         {d31}, [r1], r2
 
     bx              lr
-    ENDP  ; |vp9_idct16x16_10_add_neon_pass1|
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass1|
 
-;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
+;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
 ;                                           int16_t *output,
 ;                                           int16_t *pass1Output,
 ;                                           int16_t skip_adding,
@@ -924,7 +924,7 @@
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass2| PROC
+|vpx_idct16x16_10_add_neon_pass2| PROC
     push            {r3-r9}
 
     ; TODO(hkuang): Find a better way to load the elements.
@@ -1175,5 +1175,5 @@
 end_idct10_16x16_pass2
     pop             {r3-r9}
     bx              lr
-    ENDP  ; |vp9_idct16x16_10_add_neon_pass2|
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass2|
     END
diff --git a/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 0000000..651ebb2
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1317 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+void vpx_idct16x16_256_add_neon_pass1(
+        int16_t *in,
+        int16_t *out,
+        int output_stride) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);
+    q8s16  = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16  = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d18s16, d1s16);
+    q6s32 = vmull_s16(d19s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d8s16 = vqrshrn_n_s32(q2s32, 14);
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q5s32, 14);
+    d15s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    q2s32 = vmull_s16(d26s16, d2s16);
+    q3s32 = vmull_s16(d27s16, d2s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q15s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+    d10s16 = vqrshrn_n_s32(q2s32, 14);
+    d11s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q15s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d30s16);
+    q11s32 = vmull_s16(d17s16, d30s16);
+    q0s32 = vmull_s16(d24s16, d30s16);
+    q1s32 = vmull_s16(d25s16, d30s16);
+
+    d30s16 = vdup_n_s16(cospi_24_64);
+    d31s16 = vdup_n_s16(cospi_8_64);
+
+    q3s32 = vaddq_s32(q2s32, q0s32);
+    q12s32 = vaddq_s32(q11s32, q1s32);
+    q13s32 = vsubq_s32(q2s32, q0s32);
+    q1s32 = vsubq_s32(q11s32, q1s32);
+
+    d16s16 = vqrshrn_n_s32(q3s32, 14);
+    d17s16 = vqrshrn_n_s32(q12s32, 14);
+    d18s16 = vqrshrn_n_s32(q13s32, 14);
+    d19s16 = vqrshrn_n_s32(q1s32, 14);
+    q8s16 = vcombine_s16(d16s16, d17s16);
+    q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q0s32 = vmull_s16(d20s16, d31s16);
+    q1s32 = vmull_s16(d21s16, d31s16);
+    q12s32 = vmull_s16(d20s16, d30s16);
+    q13s32 = vmull_s16(d21s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+    d22s16 = vqrshrn_n_s32(q0s32, 14);
+    d23s16 = vqrshrn_n_s32(q1s32, 14);
+    d20s16 = vqrshrn_n_s32(q12s32, 14);
+    d21s16 = vqrshrn_n_s32(q13s32, 14);
+    q10s16 = vcombine_s16(d20s16, d21s16);
+    q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q15s16 = vaddq_s16(q6s16, q7s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    // stage 5
+    q0s16 = vaddq_s16(q8s16, q11s16);
+    q1s16 = vaddq_s16(q9s16, q10s16);
+    q2s16 = vsubq_s16(q9s16, q10s16);
+    q3s16 = vsubq_s16(q8s16, q11s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q11s32 = vmull_s16(d26s16, d16s16);
+    q12s32 = vmull_s16(d27s16, d16s16);
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+
+    q6s32 = vsubq_s32(q9s32, q11s32);
+    q13s32 = vsubq_s32(q10s32, q12s32);
+    q9s32 = vaddq_s32(q9s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q12s32);
+
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d11s16 = vqrshrn_n_s32(q13s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 6
+    q8s16 = vaddq_s16(q0s16, q15s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d16u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
+
+void vpx_idct16x16_256_add_neon_pass2(
+        int16_t *src,
+        int16_t *out,
+        int16_t *pass1Output,
+        int16_t skip_adding,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d;
+    uint8x8_t d12u8, d13u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64;
+    int64x1_t d12s64, d13s64;
+    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+
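+    // The caller passes src = input + 1, so the de-interleaving vld2q_s16
+    // loads keep only the odd-numbered coefficients of each row in val[0].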
+    q0x2s16 = vld2q_s16(src);
+    q8s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d12s16 = vdup_n_s16(cospi_30_64);
+    d13s16 = vdup_n_s16(cospi_2_64);
+
+    q2s32 = vmull_s16(d16s16, d12s16);
+    q3s32 = vmull_s16(d17s16, d12s16);
+    q1s32 = vmull_s16(d16s16, d13s16);
+    q4s32 = vmull_s16(d17s16, d13s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+    d0s16 = vqrshrn_n_s32(q2s32, 14);
+    d1s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q1s32, 14);
+    d15s16 = vqrshrn_n_s32(q4s32, 14);
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_14_64);
+    d31s16 = vdup_n_s16(cospi_18_64);
+
+    q2s32 = vmull_s16(d24s16, d30s16);
+    q3s32 = vmull_s16(d25s16, d30s16);
+    q4s32 = vmull_s16(d24s16, d31s16);
+    q5s32 = vmull_s16(d25s16, d31s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q2s32, 14);
+    d3s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q4s32, 14);
+    d13s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(cospi_22_64);
+    d31s16 = vdup_n_s16(cospi_10_64);
+
+    q11s32 = vmull_s16(d20s16, d30s16);
+    q12s32 = vmull_s16(d21s16, d30s16);
+    q4s32 = vmull_s16(d20s16, d31s16);
+    q5s32 = vmull_s16(d21s16, d31s16);
+
+    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d11s16 = vqrshrn_n_s32(q5s32, 14);
+    d10s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    d30s16 = vdup_n_s16(cospi_6_64);
+    d31s16 = vdup_n_s16(cospi_26_64);
+
+    q10s32 = vmull_s16(d28s16, d30s16);
+    q11s32 = vmull_s16(d29s16, d30s16);
+    q12s32 = vmull_s16(d28s16, d31s16);
+    q13s32 = vmull_s16(d29s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q11s32, 14);
+    d8s16 = vqrshrn_n_s32(q12s32, 14);
+    d9s16 = vqrshrn_n_s32(q13s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 3
+    q9s16  = vsubq_s16(q0s16, q1s16);
+    q0s16  = vaddq_s16(q0s16, q1s16);
+    q10s16 = vsubq_s16(q3s16, q2s16);
+    q11s16 = vaddq_s16(q2s16, q3s16);
+    q12s16 = vaddq_s16(q4s16, q5s16);
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16  = vaddq_s16(q6s16, q7s16);
+
+    // stage 4
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q2s32 = vmull_s16(d18s16, d31s16);
+    q3s32 = vmull_s16(d19s16, d31s16);
+    q4s32 = vmull_s16(d28s16, d31s16);
+    q5s32 = vmull_s16(d29s16, d31s16);
+
+    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q3s32, 14);
+    d2s16 = vqrshrn_n_s32(q4s32, 14);
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q3s16 = q11s16;
+    q4s16 = q12s16;
+
+    d30s16 = vdup_n_s16(-cospi_8_64);
+    q11s32 = vmull_s16(d26s16, d30s16);
+    q12s32 = vmull_s16(d27s16, d30s16);
+    q8s32 = vmull_s16(d20s16, d30s16);
+    q9s32 = vmull_s16(d21s16, d30s16);
+
+    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+
+    q3s32 = vmull_s16(d26s16, d14s16);
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q10s32 = vaddq_s32(q3s32, q0s32);
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q10s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
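+    // Both branches combine the odd-half results with the even-half results
+    // loaded from pass1Output.  When skip_adding != 0 (the final column pass)
+    // the sums are rounded and added to dest; otherwise (row pass) they are
+    // written back to out for the column passes to consume.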
+    if (skip_adding != 0) {
+        d = dest;
+        // load the pass1 (even-half) results
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+
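+        // reconstruction pattern used below: (pass1 + pass2 + 32) >> 6, add
+        // the eight destination pixels widened to 16 bits, then saturate back
+        // to uint8.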
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        // store rows 8-15: round, add to dest, saturate
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q8s16 = vrshrq_n_s16(q8s16, 6);
+        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q9s16 = vrshrq_n_s16(q9s16, 6);
+        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q2s16 = vrshrq_n_s16(q2s16, 6);
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q3s16 = vrshrq_n_s16(q3s16, 6);
+        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q4s16 = vrshrq_n_s16(q4s16, 6);
+        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q5s16 = vrshrq_n_s16(q5s16, 6);
+        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q14s16 = vrshrq_n_s16(q14s16, 6);
+        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        q15s16 = vrshrq_n_s16(q15s16, 6);
+        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    } else {  // skip_adding_dest: skip_adding == 0, write results to out only
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+    }
+    return;
+}
+
+void vpx_idct16x16_10_add_neon_pass1(
+        int16_t *in,
+        int16_t *out,
+        int output_stride) {
+    int16x4_t d4s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);
+    q8s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // stage 3
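+    // The 10-coefficient path assumes only the top-left 4x4 block of
+    // coefficients is non-zero, so of the eight loaded rows only q8/q9 can
+    // carry data and each rotation collapses to a single multiply.
+    // vqrdmulhq_s16 by 2*cospi gives the same result as the 32-bit multiply
+    // plus vqrshrn_n_s32(x, 14) used in the full-sized version.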
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+    // stage 4
+    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+    d4s16 = vdup_n_s16(cospi_16_64);
+
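+    // DC term: q8 * cospi_16 via the same doubling-multiply trick.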
+    q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+    q9s32  = vmull_s16(d14s16, d4s16);
+    q10s32 = vmull_s16(d15s16, d4s16);
+    q12s32 = vmull_s16(d9s16, d4s16);
+    q11s32 = vmull_s16(d8s16, d4s16);
+
+    q15s32 = vsubq_s32(q10s32, q12s32);
+    q6s32 = vsubq_s32(q9s32, q11s32);
+    q9s32 = vaddq_s32(q9s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q12s32);
+
+    d11s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 6
+    q2s16 = vaddq_s16(q8s16, q7s16);
+    q9s16 = vaddq_s16(q8s16, q6s16);
+    q10s16 = vaddq_s16(q8s16, q5s16);
+    q11s16 = vaddq_s16(q8s16, q4s16);
+    q12s16 = vsubq_s16(q8s16, q4s16);
+    q13s16 = vsubq_s16(q8s16, q5s16);
+    q14s16 = vsubq_s16(q8s16, q6s16);
+    q15s16 = vsubq_s16(q8s16, q7s16);
+
+    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d4u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
+
+void vpx_idct16x16_10_add_neon_pass2(
+        int16_t *src,
+        int16_t *out,
+        int16_t *pass1Output,
+        int16_t skip_adding,
+        uint8_t *dest,
+        int dest_stride) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+    (void)skip_adding;
+    (void)dest;
+    (void)dest_stride;
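+    // This variant only runs the row pass (the column passes of the _10
+    // wrapper reuse the full 256-coefficient functions), so the dest-related
+    // arguments are never used; they are presumably kept so the prototype
+    // matches the other pass2 functions.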
+
+    q0x2s16 = vld2q_s16(src);
+    q8s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // stage 3
+    q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+    q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+    q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+    q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+    q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+    // stage 4
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d6s16 = vget_low_s16(q3s16);
+    d7s16 = vget_high_s16(q3s16);
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q12s32 = vmull_s16(d14s16, d31s16);
+    q5s32 = vmull_s16(d15s16, d31s16);
+    q2s32 = vmull_s16(d0s16, d31s16);
+    q11s32 = vmull_s16(d1s16, d31s16);
+
+    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q12s32, 14);
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q11s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(-cospi_8_64);
+    q10s32 = vmull_s16(d8s16, d30s16);
+    q13s32 = vmull_s16(d9s16, d30s16);
+    q8s32 = vmull_s16(d6s16, d30s16);
+    q9s32 = vmull_s16(d7s16, d30s16);
+
+    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q10s32, 14);
+    d5s16 = vqrshrn_n_s32(q13s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+    q3s32 = vmull_s16(d26s16, d14s16);
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q0s32 = vaddq_s32(q3s32, q0s32);
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q0s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q15s16);
+    q13s16 = vaddq_s16(q1s16, q14s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q14s16 = vsubq_s16(q1s16, q14s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q10s16, q5s16);
+    q13s16 = vaddq_s16(q11s16, q4s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q4s16 = vsubq_s16(q11s16, q4s16);
+    q5s16 = vsubq_s16(q10s16, q5s16);
+
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q3s16);
+    q13s16 = vaddq_s16(q1s16, q2s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q2s16 = vsubq_s16(q1s16, q2s16);
+    q3s16 = vsubq_s16(q0s16, q3s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    q12s16 = vaddq_s16(q10s16, q9s16);
+    q13s16 = vaddq_s16(q11s16, q8s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q8s16 = vsubq_s16(q11s16, q8s16);
+    q9s16 = vsubq_s16(q10s16, q9s16);
+
+    d4u64  = vreinterpret_u64_s16(vget_low_s16(q2s16));
+    d5u64  = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d6u64  = vreinterpret_u64_s16(vget_low_s16(q3s16));
+    d7u64  = vreinterpret_u64_s16(vget_high_s16(q3s16));
+    d8u64  = vreinterpret_u64_s16(vget_low_s16(q4s16));
+    d9u64  = vreinterpret_u64_s16(vget_high_s16(q4s16));
+    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    vst1_u64((uint64_t *)out, d16u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d4u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d6u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d7u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d8u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d9u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d10u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d11u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vpx_dsp/arm/idct16x16_neon.c
similarity index 79%
rename from libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
rename to libvpx/vpx_dsp/arm/idct16x16_neon.c
index 0b9fc09..352979a 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/libvpx/vpx_dsp/arm/idct16x16_neon.c
@@ -8,50 +8,55 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
-void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
                                       int16_t *output,
                                       int output_stride);
-void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
                                       int16_t *output,
                                       int16_t *pass1Output,
                                       int16_t skip_adding,
                                       uint8_t *dest,
                                       int dest_stride);
-void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
                                      int16_t *output,
                                      int output_stride);
-void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
                                      int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding,
                                      uint8_t *dest,
                                      int dest_stride);
 
+#if HAVE_NEON_ASM
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
+extern void vpx_push_neon(int64_t *store);
+extern void vpx_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM
 
-void vp9_idct16x16_256_add_neon(const int16_t *input,
+void vpx_idct16x16_256_add_neon(const int16_t *input,
                                 uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
   int64_t store_reg[8];
+#endif
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
+#if HAVE_NEON_ASM
   // save d8-d15 register values.
-  vp9_push_neon(store_reg);
+  vpx_push_neon(store_reg);
+#endif
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_idct16x16_256_add_neon_pass2(input+1,
+  vpx_idct16x16_256_add_neon_pass2(input+1,
                                      row_idct_output,
                                      pass1_output,
                                      0,
@@ -61,12 +66,12 @@
   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
+  vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      0,
@@ -76,12 +81,12 @@
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -91,42 +96,48 @@
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
                                      dest+8,
                                      dest_stride);
 
+#if HAVE_NEON_ASM
   // restore d8-d15 register values.
-  vp9_pop_neon(store_reg);
+  vpx_pop_neon(store_reg);
+#endif
 
   return;
 }
 
-void vp9_idct16x16_10_add_neon(const int16_t *input,
+void vpx_idct16x16_10_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
   int64_t store_reg[8];
+#endif
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
+#if HAVE_NEON_ASM
   // save d8-d15 register values.
-  vp9_push_neon(store_reg);
+  vpx_push_neon(store_reg);
+#endif
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+  vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_idct16x16_10_add_neon_pass2(input+1,
+  vpx_idct16x16_10_add_neon_pass2(input+1,
                                         row_idct_output,
                                         pass1_output,
                                         0,
@@ -138,12 +149,12 @@
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -153,20 +164,22 @@
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
                                      dest+8,
                                      dest_stride);
 
+#if HAVE_NEON_ASM
   // restore d8-d15 register values.
-  vp9_pop_neon(store_reg);
+  vpx_pop_neon(store_reg);
+#endif
 
   return;
 }
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
similarity index 95%
rename from libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
index d290d07..96d276b 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
@@ -7,7 +7,7 @@
 ;  file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_idct32x32_1_add_neon|
+    EXPORT  |vpx_idct32x32_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -64,14 +64,14 @@
     vst1.8           {q15},[$dst], $stride
     MEND
 
-;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
 ;                              int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride
 
-|vp9_idct32x32_1_add_neon| PROC
+|vpx_idct32x32_1_add_neon| PROC
     push             {lr}
     pld              [r1]
     add              r3, r1, #16               ; r3 dest + 16 for second loop
@@ -140,5 +140,5 @@
     bne              diff_positive_32_32_loop
     pop              {pc}
 
-    ENDP             ; |vp9_idct32x32_1_add_neon|
+    ENDP             ; |vpx_idct32x32_1_add_neon|
     END
diff --git a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 0000000..c25c0c4
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void LD_16x8(
+        uint8_t *d,
+        int d_stride,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vld1q_u8(d);
+    d += d_stride;
+    *q9u8 = vld1q_u8(d);
+    d += d_stride;
+    *q10u8 = vld1q_u8(d);
+    d += d_stride;
+    *q11u8 = vld1q_u8(d);
+    d += d_stride;
+    *q12u8 = vld1q_u8(d);
+    d += d_stride;
+    *q13u8 = vld1q_u8(d);
+    d += d_stride;
+    *q14u8 = vld1q_u8(d);
+    d += d_stride;
+    *q15u8 = vld1q_u8(d);
+    return;
+}
+
+static INLINE void ADD_DIFF_16x8(
+        uint8x16_t qdiffu8,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+    *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void SUB_DIFF_16x8(
+        uint8x16_t qdiffu8,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+    *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void ST_16x8(
+        uint8_t *d,
+        int d_stride,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    vst1q_u8(d, *q8u8);
+    d += d_stride;
+    vst1q_u8(d, *q9u8);
+    d += d_stride;
+    vst1q_u8(d, *q10u8);
+    d += d_stride;
+    vst1q_u8(d, *q11u8);
+    d += d_stride;
+    vst1q_u8(d, *q12u8);
+    d += d_stride;
+    vst1q_u8(d, *q13u8);
+    d += d_stride;
+    vst1q_u8(d, *q14u8);
+    d += d_stride;
+    vst1q_u8(d, *q15u8);
+    return;
+}
+
+void vpx_idct32x32_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int i, j, dest_stride8;
+    uint8_t *d;
+    int16_t a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
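+    // a1 is the single DC contribution; it is added to (or subtracted from)
+    // every pixel of the 32x32 block, processed as two 16-pixel-wide halves
+    // of four 8-row strips each.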
+    dest_stride8 = dest_stride * 8;
+    if (a1 >= 0) {  // diff_positive_32_32
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+        q0u8 = vdupq_n_u8(a1);
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
+            d = dest;
+            for (j = 0; j < 4; j++) {
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    } else {  // diff_negative_32_32
+        a1 = -a1;
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+        q0u8 = vdupq_n_u8(a1);
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
+            d = dest;
+            for (j = 0; j < 4; j++) {
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    }
+    return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
similarity index 99%
rename from libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
index 72e933e..7483ee7 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
@@ -43,7 +43,7 @@
 cospi_31_64 EQU   804
 
 
-    EXPORT  |vp9_idct32x32_1024_add_neon|
+    EXPORT  |vpx_idct32x32_1024_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -288,7 +288,7 @@
     MEND
     ; --------------------------------------------------------------------------
 
-;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
 ;
 ;   r0  int16_t *input,
 ;   r1  uint8_t *dest,
@@ -303,7 +303,7 @@
 ;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
 ;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
 
-|vp9_idct32x32_1024_add_neon| PROC
+|vpx_idct32x32_1024_add_neon| PROC
     ; This function does one pass of idct32x32 transform.
     ;
     ; This is done by transposing the input and then doing a 1d transform on
@@ -1295,5 +1295,5 @@
     vpop {d8-d15}
     pop  {r4-r11}
     bx              lr
-    ENDP  ; |vp9_idct32x32_1024_add_neon|
+    ENDP  ; |vpx_idct32x32_1024_add_neon|
     END
diff --git a/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 0000000..025437e
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,719 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+    q14s16 = vld1q_s16(trans_buf + first * 8); \
+    q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+    qA = vld1q_s16(out + first * 32); \
+    qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+    vst1q_s16(out + first * 32, qA); \
+    vst1q_s16(out + second * 32, qB);
+
+#define  STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+                                      q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+        uint8_t *p1,
+        uint8_t *p2,
+        int stride,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16) {
+    int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+    d8s16 = vld1_s16((int16_t *)p1);
+    p1 += stride;
+    d11s16 = vld1_s16((int16_t *)p2);
+    p2 -= stride;
+    d9s16 = vld1_s16((int16_t *)p1);
+    d10s16 = vld1_s16((int16_t *)p2);
+
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q8s16 = vrshrq_n_s16(q8s16, 6);
+    q9s16 = vrshrq_n_s16(q9s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           vreinterpret_u8_s16(d9s16)));
+    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                                           vreinterpret_u8_s16(d10s16)));
+    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                                           vreinterpret_u8_s16(d11s16)));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           vreinterpret_u8_s16(d8s16)));
+
+    d9s16  = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+    d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+    d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+    d8s16  = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+    vst1_s16((int16_t *)p1, d9s16);
+    p1 -= stride;
+    vst1_s16((int16_t *)p2, d10s16);
+    p2 += stride;
+    vst1_s16((int16_t *)p1, d8s16);
+    vst1_s16((int16_t *)p2, d11s16);
+    return;
+}
+
+#define  STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
+       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+                                      q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+        uint8_t *p1,
+        uint8_t *p2,
+        int stride,
+        int16x8_t q4s16,
+        int16x8_t q5s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16) {
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+    d4s16 = vld1_s16((int16_t *)p1);
+    p1 += stride;
+    d7s16 = vld1_s16((int16_t *)p2);
+    p2 -= stride;
+    d5s16 = vld1_s16((int16_t *)p1);
+    d6s16 = vld1_s16((int16_t *)p2);
+
+    q5s16 = vrshrq_n_s16(q5s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q4s16 = vrshrq_n_s16(q4s16, 6);
+
+    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                                           vreinterpret_u8_s16(d5s16)));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           vreinterpret_u8_s16(d6s16)));
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           vreinterpret_u8_s16(d7s16)));
+    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                                           vreinterpret_u8_s16(d4s16)));
+
+    d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+    d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+    d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+    d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+    vst1_s16((int16_t *)p1, d5s16);
+    p1 -= stride;
+    vst1_s16((int16_t *)p2, d6s16);
+    p2 += stride;
+    vst1_s16((int16_t *)p2, d7s16);
+    vst1_s16((int16_t *)p1, d4s16);
+    return;
+}
+
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+        DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(
+        int16x8_t q14s16,
+        int16x8_t q13s16,
+        int16_t first_const,
+        int16_t second_const,
+        int16x8_t *qAs16,
+        int16x8_t *qBs16) {
+    int16x4_t d30s16, d31s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+    int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+    dCs16 = vget_low_s16(q14s16);
+    dDs16 = vget_high_s16(q14s16);
+    dAs16 = vget_low_s16(q13s16);
+    dBs16 = vget_high_s16(q13s16);
+
+    d30s16 = vdup_n_s16(first_const);
+    d31s16 = vdup_n_s16(second_const);
+
+    q8s32 = vmull_s16(dCs16, d30s16);
+    q10s32 = vmull_s16(dAs16, d31s16);
+    q9s32 = vmull_s16(dDs16, d30s16);
+    q11s32 = vmull_s16(dBs16, d31s16);
+    q12s32 = vmull_s16(dCs16, d31s16);
+
+    q8s32 = vsubq_s32(q8s32, q10s32);
+    q9s32 = vsubq_s32(q9s32, q11s32);
+
+    q10s32 = vmull_s16(dDs16, d31s16);
+    q11s32 = vmull_s16(dAs16, d30s16);
+    q15s32 = vmull_s16(dBs16, d30s16);
+
+    q11s32 = vaddq_s32(q12s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q15s32);
+
+    *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+                          vqrshrn_n_s32(q9s32, 14));
+    *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+                          vqrshrn_n_s32(q10s32, 14));
+    return;
+}
+
+static INLINE void idct32_transpose_pair(
+        int16_t *input,
+        int16_t *t_buf) {
+    int16_t *in;
+    int i;
+    const int stride = 32;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    for (i = 0; i < 4; i++, input += 8) {
+        in = input;
+        q8s16 = vld1q_s16(in);
+        in += stride;
+        q9s16 = vld1q_s16(in);
+        in += stride;
+        q10s16 = vld1q_s16(in);
+        in += stride;
+        q11s16 = vld1q_s16(in);
+        in += stride;
+        q12s16 = vld1q_s16(in);
+        in += stride;
+        q13s16 = vld1q_s16(in);
+        in += stride;
+        q14s16 = vld1q_s16(in);
+        in += stride;
+        q15s16 = vld1q_s16(in);
+
+        d16s16 = vget_low_s16(q8s16);
+        d17s16 = vget_high_s16(q8s16);
+        d18s16 = vget_low_s16(q9s16);
+        d19s16 = vget_high_s16(q9s16);
+        d20s16 = vget_low_s16(q10s16);
+        d21s16 = vget_high_s16(q10s16);
+        d22s16 = vget_low_s16(q11s16);
+        d23s16 = vget_high_s16(q11s16);
+        d24s16 = vget_low_s16(q12s16);
+        d25s16 = vget_high_s16(q12s16);
+        d26s16 = vget_low_s16(q13s16);
+        d27s16 = vget_high_s16(q13s16);
+        d28s16 = vget_low_s16(q14s16);
+        d29s16 = vget_high_s16(q14s16);
+        d30s16 = vget_low_s16(q15s16);
+        d31s16 = vget_high_s16(q15s16);
+
+        q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+        q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+        q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+        q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+        q12s16 = vcombine_s16(d17s16, d25s16);
+        q13s16 = vcombine_s16(d19s16, d27s16);
+        q14s16 = vcombine_s16(d21s16, d29s16);
+        q15s16 = vcombine_s16(d23s16, d31s16);
+
+        q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                            vreinterpretq_s32_s16(q10s16));
+        q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+                            vreinterpretq_s32_s16(q11s16));
+        q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+                            vreinterpretq_s32_s16(q14s16));
+        q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+                            vreinterpretq_s32_s16(q15s16));
+
+        q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                            vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+        q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                            vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+        q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                            vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+        q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                            vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+        vst1q_s16(t_buf, q0x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q0x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q1x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q1x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q2x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q2x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q3x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q3x2s16.val[1]);
+        t_buf += 8;
+    }
+    return;
+}
+
+static INLINE void idct32_bands_end_1st_pass(
+        int16_t *out,
+        int16x8_t q2s16,
+        int16x8_t q3s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16,
+        int16x8_t q10s16,
+        int16x8_t q11s16,
+        int16x8_t q12s16,
+        int16x8_t q13s16,
+        int16x8_t q14s16,
+        int16x8_t q15s16) {
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+    STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+    STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+    STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+    STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+    STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+    STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+    STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+    STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+    STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+    return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(
+        int16_t *out,
+        uint8_t *dest,
+        int stride,
+        int16x8_t q2s16,
+        int16x8_t q3s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16,
+        int16x8_t q10s16,
+        int16x8_t q11s16,
+        int16x8_t q12s16,
+        int16x8_t q13s16,
+        int16x8_t q14s16,
+        int16x8_t q15s16) {
+    uint8_t *r6  = dest + 31 * stride;
+    uint8_t *r7  = dest/* +  0 * stride*/;
+    uint8_t *r9  = dest + 15 * stride;
+    uint8_t *r10 = dest + 16 * stride;
+    int str2 = stride << 1;
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+    LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    return;
+}
+
+void vpx_idct32x32_1024_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int stride) {
+    int i, idct32_pass_loop;
+    int16_t trans_buf[32 * 8];
+    int16_t pass1[32 * 32];
+    int16_t pass2[32 * 32];
+    int16_t *out;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+    for (idct32_pass_loop = 0, out = pass1;
+         idct32_pass_loop < 2;
+         idct32_pass_loop++,
+         input = pass1,  // the input of pass2 is the result of pass1
+         out = pass2) {
+        for (i = 0;
+             i < 4; i++,
+             input += 32 * 8, out += 8) {  // idct32_bands_loop
+            idct32_transpose_pair(input, trans_buf);
+
+            // -----------------------------------------
+            // BLOCK A: 16-19,28-31
+            // -----------------------------------------
+            // generate 16,17,30,31
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(0, 1, 31)
+            DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(31, 17, 15)
+            DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+            // part of stage 2
+            q4s16 = vaddq_s16(q0s16, q1s16);
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q6s16 = vaddq_s16(q2s16, q3s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+            // generate 18,19,28,29
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(15, 9, 23)
+            DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(23, 25, 7)
+            DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q3s16, q2s16);
+            q3s16 = vaddq_s16(q3s16, q2s16);
+            q14s16 = vsubq_s16(q1s16, q0s16);
+            q2s16 = vaddq_s16(q1s16, q0s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+            // part of stage 4
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q15s16 = vaddq_s16(q6s16, q3s16);
+            q13s16 = vsubq_s16(q5s16, q0s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+            STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+            STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q4s16, q2s16);
+            q14s16 = vsubq_s16(q6s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+            STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+            // -----------------------------------------
+            // BLOCK B: 20-23,24-27
+            // -----------------------------------------
+            // generate 20,21,26,27
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(7, 5, 27)
+            DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(27, 21, 11)
+            DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+            // generate 22,23,24,25
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(11, 13, 19)
+            DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(19, 29, 3)
+            DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+            // part of stage 2
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16  = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16  = vaddq_s16(q6s16, q7s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+            // part of stage 4
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q11s16 = vaddq_s16(q5s16, q0s16);
+            q12s16 = vaddq_s16(q6s16, q2s16);
+            q15s16 = vaddq_s16(q4s16, q3s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q11s16);
+            q9s16 = vaddq_s16(q13s16, q10s16);
+            q13s16 = vsubq_s16(q13s16, q10s16);
+            q11s16 = vsubq_s16(q14s16, q11s16);
+            STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+            LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+            q8s16  = vsubq_s16(q9s16, q12s16);
+            q10s16 = vaddq_s16(q14s16, q15s16);
+            q14s16 = vsubq_s16(q14s16, q15s16);
+            q12s16 = vaddq_s16(q9s16, q12s16);
+            STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+            q13s16 = q11s16;
+            q14s16 = q8s16;
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+            // part of stage 4
+            q14s16 = vsubq_s16(q5s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q2s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            q13s16 = vsubq_s16(q4s16, q3s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q1s16);
+            q9s16 = vaddq_s16(q13s16, q6s16);
+            q13s16 = vsubq_s16(q13s16, q6s16);
+            q1s16 = vsubq_s16(q14s16, q1s16);
+            STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+            LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+            q14s16 = vsubq_s16(q8s16, q5s16);
+            q10s16 = vaddq_s16(q8s16, q5s16);
+            q11s16 = vaddq_s16(q9s16, q0s16);
+            q0s16 = vsubq_s16(q9s16, q0s16);
+            STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+                                                         &q1s16, &q0s16);
+            STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+            // -----------------------------------------
+            // BLOCK C: 8-10,11-15
+            // -----------------------------------------
+            // generate 8,9,14,15
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(3, 2, 30)
+            DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(30, 18, 14)
+            DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+            // part of stage 3
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+            // generate 10,11,12,13
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(14, 10, 22)
+            DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(22, 26, 6)
+            DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+            // part of stage 3
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16 = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16 = vaddq_s16(q6s16, q7s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+            // part of stage 5
+            q8s16 = vaddq_s16(q0s16, q5s16);
+            q9s16 = vaddq_s16(q1s16, q7s16);
+            q13s16 = vsubq_s16(q1s16, q7s16);
+            q14s16 = vsubq_s16(q3s16, q4s16);
+            q10s16 = vaddq_s16(q3s16, q4s16);
+            q15s16 = vaddq_s16(q2s16, q6s16);
+            STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+            STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+            // part of stage 6
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+            q13s16 = vsubq_s16(q0s16, q5s16);
+            q14s16 = vsubq_s16(q2s16, q6s16);
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+            // -----------------------------------------
+            // BLOCK D: 0-3,4-7
+            // -----------------------------------------
+            // generate 4,5,6,7
+            // part of stage 3
+            LOAD_FROM_TRANSPOSED(6, 4, 28)
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(28, 20, 12)
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+            // generate 0,1,2,3
+            // part of stage 4
+            LOAD_FROM_TRANSPOSED(12, 0, 16)
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(16, 8, 24)
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+            // part of stage 5
+            q4s16 = vaddq_s16(q7s16, q6s16);
+            q7s16 = vsubq_s16(q7s16, q6s16);
+            q6s16 = vsubq_s16(q5s16, q14s16);
+            q5s16 = vaddq_s16(q5s16, q14s16);
+            // part of stage 6
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q3s16);
+            q10s16 = vaddq_s16(q6s16, q1s16);
+            q11s16 = vaddq_s16(q7s16, q0s16);
+            q12s16 = vsubq_s16(q7s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q1s16);
+            q14s16 = vsubq_s16(q5s16, q3s16);
+            q15s16 = vsubq_s16(q4s16, q2s16);
+            // part of stage 7
+            LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+            q2s16 = vaddq_s16(q8s16, q1s16);
+            q3s16 = vaddq_s16(q9s16, q0s16);
+            q4s16 = vsubq_s16(q9s16, q0s16);
+            q5s16 = vsubq_s16(q8s16, q1s16);
+            LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+            q8s16 = vaddq_s16(q4s16, q1s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q6s16 = vsubq_s16(q5s16, q0s16);
+            q7s16 = vsubq_s16(q4s16, q1s16);
+
+            if (idct32_pass_loop == 0) {
+                idct32_bands_end_1st_pass(out,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+            } else {
+                idct32_bands_end_2nd_pass(out, dest, stride,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+                dest += 8;
+            }
+        }
+    }
+    return;
+}
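
Both passes of vpx_idct32x32_1024_add_neon above lean on DO_BUTTERFLY, which
rotates a pair of rows by a (first_const, second_const) cospi pair and narrows
the 32-bit products with a rounding shift of 14. A scalar sketch of that
per-element rotation, ignoring the saturation that vqrshrn_n_s32 also applies
(for reference only, not part of the commit):

    /* Illustrative scalar form of DO_BUTTERFLY: in_a comes from q14s16,
     * in_b from q13s16, c1/c2 are first_const/second_const. */
    #include <stdint.h>

    static void do_butterfly_ref(int16_t in_a, int16_t in_b,
                                 int16_t c1, int16_t c2,
                                 int16_t *out_a, int16_t *out_b) {
        int32_t ta = (int32_t)in_a * c1 - (int32_t)in_b * c2;
        int32_t tb = (int32_t)in_a * c2 + (int32_t)in_b * c1;
        *out_a = (int16_t)((ta + (1 << 13)) >> 14);   /* round, shift by 14 */
        *out_b = (int16_t)((tb + (1 << 13)) >> 14);
    }
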
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
similarity index 91%
rename from libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
index 0d4a721..adab715 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_idct4x4_1_add_neon|
+    EXPORT  |vpx_idct4x4_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct4x4_1_add_neon| PROC
+|vpx_idct4x4_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
     vst1.32          {d7[1]}, [r12]
 
     bx               lr
-    ENDP             ; |vp9_idct4x4_1_add_neon|
+    ENDP             ; |vpx_idct4x4_1_add_neon|
 
     END
diff --git a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 0000000..ea61870
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct4x4_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d6u8;
+    uint32x2_t d2u32 = vdup_n_u32(0);
+    uint16x8_t q8u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 4);
+
+    q0s16 = vdupq_n_s16(a1);
+
+    // dc_only_idct_add
+    d1 = d2 = dest;
+    for (i = 0; i < 2; i++) {
+        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+        d1 += dest_stride;
+        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+        d1 += dest_stride;
+
+        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+                         vreinterpret_u8_u32(d2u32));
+        d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+        d2 += dest_stride;
+        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+        d2 += dest_stride;
+    }
+    return;
+}
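
The DC-only 4x4 path above derives a1 from input[0] with two fixed-point
multiplies by cospi_16_64, each followed by dct_const_round_shift, and a final
4-bit rounding shift. Expanded under the usual assumption that
dct_const_round_shift rounds by DCT_CONST_BITS == 14 (that definition lives in
vpx_dsp/inv_txfm.h, not in this diff), the arithmetic is:

    /* Illustrative expansion of the a1 computation above; not part of the
     * diff.  Assumes dct_const_round_shift(x) == (x + (1 << 13)) >> 14 and
     * ROUND_POWER_OF_TWO(x, 4) == (x + 8) >> 4. */
    #include <stdint.h>

    static int16_t dc_value_4x4_ref(int16_t input0) {
        const int32_t cospi_16_64 = 11585;             /* ~sqrt(2)/2 in Q14 */
        int32_t out = (input0 * cospi_16_64 + (1 << 13)) >> 14;
        out = (out * cospi_16_64 + (1 << 13)) >> 14;
        return (int16_t)((out + 8) >> 4);              /* final rounding by 16 */
    }
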
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm b/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
similarity index 96%
rename from libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
index 00283fc..877fbd6 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_idct4x4_16_add_neon|
+    EXPORT  |vpx_idct4x4_16_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -16,13 +16,13 @@
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct4x4_16_add_neon| PROC
+|vpx_idct4x4_16_add_neon| PROC
 
     ; The 2D transform is done with two passes which are actually pretty
     ; similar. We first transform the rows. This is done by transposing
@@ -185,6 +185,6 @@
     vst1.32 {d26[1]}, [r1], r2
     vst1.32 {d26[0]}, [r1]  ; no post-increment
     bx              lr
-    ENDP  ; |vp9_idct4x4_16_add_neon|
+    ENDP  ; |vpx_idct4x4_16_add_neon|
 
     END
diff --git a/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 0000000..3c975c9
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vpx_idct4x4_16_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d26u8, d27u8;
+    uint32x2_t d26u32, d27u32;
+    uint16x8_t q8u16, q9u16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+    int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+    int16x8_t q8s16, q9s16, q13s16, q14s16;
+    int32x4_t q1s32, q13s32, q14s32, q15s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+    uint8_t *d;
+    int16_t cospi_8_64 = 15137;
+    int16_t cospi_16_64 = 11585;
+    int16_t cospi_24_64 = 6270;
+
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    d20s16 = vdup_n_s16(cospi_8_64);
+    d21s16 = vdup_n_s16(cospi_16_64);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    d22s16 = vdup_n_s16(cospi_24_64);
+
+    // stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);
+    q1s32  = vmull_s16(d17s16, d20s16);
+    q13s32 = vmull_s16(d23s16, d21s16);
+    q14s32 = vmull_s16(d24s16, d21s16);
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // stage 2
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+    d19s16 = vget_low_s16(q9s16);
+
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    // do the transform on columns
+    // stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);
+    q1s32  = vmull_s16(d17s16, d20s16);
+    q13s32 = vmull_s16(d23s16, d21s16);
+    q14s32 = vmull_s16(d24s16, d21s16);
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // stage 2
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d = dest;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+    d += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u32(d27u32));
+
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    d = dest;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+    return;
+}
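
vpx_idct4x4_16_add_neon above runs the same 4-point stage twice, once on rows
and once on columns, with a transpose in between; each stage is the pair of
vmull/vmlal/vmlsl groups followed by vqrshrn_n_s32. A scalar sketch of one
such 4-point stage, using the constants declared in the file (reference only,
not part of the commit; saturation on the narrowing shifts is omitted):

    /* Illustrative scalar 4-point IDCT stage matching the intrinsics above. */
    #include <stdint.h>

    static int16_t round14(int32_t v) { return (int16_t)((v + (1 << 13)) >> 14); }

    static void idct4_ref(const int16_t in[4], int16_t out[4]) {
        const int32_t cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
        /* stage 1 */
        int16_t s0 = round14((in[0] + in[2]) * cospi_16_64);
        int16_t s1 = round14((in[0] - in[2]) * cospi_16_64);
        int16_t s2 = round14(in[1] * cospi_24_64 - in[3] * cospi_8_64);
        int16_t s3 = round14(in[1] * cospi_8_64  + in[3] * cospi_24_64);
        /* stage 2 */
        out[0] = s0 + s3;
        out[1] = s1 + s2;
        out[2] = s1 - s2;
        out[3] = s0 - s3;
    }
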
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
similarity index 94%
rename from libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
index 421d202..dbbff36 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_idct8x8_1_add_neon|
+    EXPORT  |vpx_idct8x8_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct8x8_1_add_neon| PROC
+|vpx_idct8x8_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -83,6 +83,6 @@
     vst1.64          {d31}, [r12], r2
 
     bx               lr
-    ENDP             ; |vp9_idct8x8_1_add_neon|
+    ENDP             ; |vpx_idct8x8_1_add_neon|
 
     END
diff --git a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 0000000..c1b801f
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct8x8_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint64x1_t d2u64, d3u64, d4u64, d5u64;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 5);
+
+    q0s16 = vdupq_n_s16(a1);
+    q0u16 = vreinterpretq_u16_s16(q0s16);
+
+    d1 = d2 = dest;
+    for (i = 0; i < 2; i++) {
+        d2u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d4u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d5u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+
+        q9u16  = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+        q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+        q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+        q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+        d2u8  = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d3u8  = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+        d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+        d2 += dest_stride;
+    }
+    return;
+}
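
Because the DC-only 8x8 kernel above adds the same rounded value to every
pixel, it is easy to exercise in isolation. A hypothetical standalone check
(illustrative only; it assumes an ARM/NEON build linked against this file and
uses the prototype shown in the diff):

    /* Hypothetical smoke test: every destination pixel should shift by the
     * same rounded DC amount. */
    #include <stdint.h>
    #include <stdio.h>

    void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride);

    int main(void) {
        int16_t coeffs[64] = { 64 };           /* only the DC coefficient set */
        uint8_t block[8 * 8];
        int i;
        for (i = 0; i < 64; ++i)
            block[i] = 128;
        vpx_idct8x8_1_add_neon(coeffs, block, 8);
        printf("pixel[0]: 128 -> %d\n", block[0]);
        return 0;
    }
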
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
similarity index 97%
rename from libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
rename to libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
index ab5bb69..6ab59b4 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -8,8 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_idct8x8_64_add_neon|
-    EXPORT  |vp9_idct8x8_12_add_neon|
+    EXPORT  |vpx_idct8x8_64_add_neon|
+    EXPORT  |vpx_idct8x8_12_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -198,13 +198,13 @@
     MEND
 
     AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct8x8_64_add_neon| PROC
+|vpx_idct8x8_64_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -308,15 +308,15 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_idct8x8_64_add_neon|
+    ENDP  ; |vpx_idct8x8_64_add_neon|
 
-;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct8x8_12_add_neon| PROC
+|vpx_idct8x8_12_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_idct8x8_12_add_neon|
+    ENDP  ; |vpx_idct8x8_12_add_neon|
 
     END
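
The C port that follows builds its row/column passes around TRANSPOSE8X8,
whose vswp/vtrnq_s32/vtrnq_s16 sequence is simply an 8x8 transpose of int16
elements kept in registers. The scalar equivalent, for reference (not part of
the commit):

    /* Illustrative scalar 8x8 transpose; the NEON version performs the same
     * permutation entirely within q8-q15. */
    #include <stdint.h>

    static void transpose_8x8_ref(const int16_t in[8][8], int16_t out[8][8]) {
        int r, c;
        for (r = 0; r < 8; ++r)
            for (c = 0; c < 8; ++c)
                out[c][r] = in[r][c];          /* rows become columns */
    }
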
diff --git a/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 0000000..4b2c2a6
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,540 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+static INLINE void IDCT8x8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16 = vqrshrn_n_s32(q2s32, 14);
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16 = vaddq_s16(q0s16, q7s16);
+    *q9s16 = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+void vpx_idct8x8_64_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
+
+void vpx_idct8x8_12_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // First transform rows
+    // stage 1
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+    q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+    q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+    q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+    // stage 2 & stage 3 - even half
+    q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+    q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+    q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+    q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+    // stage 3 - odd half
+    q0s16 = vaddq_s16(q9s16, q15s16);
+    q1s16 = vaddq_s16(q9s16, q13s16);
+    q2s16 = vsubq_s16(q9s16, q13s16);
+    q3s16 = vsubq_s16(q9s16, q15s16);
+
+    // stage 2 - odd half
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    q8s16 = vaddq_s16(q0s16, q7s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q7s16);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
diff --git a/libvpx/vpx_dsp/arm/intrapred_neon.c b/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000..0a37610
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,822 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
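+// For an NxN block the DC value is the rounded average of the available
+// border pixels: with both borders (sum_top + sum_left + N) >> log2(2 * N),
+// with a single border (sum + N / 2) >> log2(N), and 0x80 (128) when neither
+// border is available. The vrshrn_n_u16() shifts below implement exactly
+// these rounded divisions.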
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_top = vcombine_u16(p1, p1);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_left = vcombine_u16(p1, p1);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 3);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 2);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 2);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 4; ++i) {
+      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+    }
+  }
+}
+
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_left = vcombine_u16(p2, p2);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+    }
+  }
+}
+
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A = vld1q_u8(above);  // top row
+    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_top = vcombine_u16(p3, p3);
+  }
+
+  if (do_left) {
+    const uint8x16_t L = vld1q_u8(left);  // left column
+    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    const uint16x4_t p3 = vpadd_u16(p2, p2);
+    sum_left = vcombine_u16(p3, p3);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 16; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+    }
+  }
+}
+
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A0 = vld1q_u8(above);  // top row
+    const uint8x16_t A1 = vld1q_u8(above + 16);
+    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
+    const uint16x8_t p1 = vpaddlq_u8(A1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_top = vcombine_u16(p5, p5);
+  }
+
+  if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left column
+    const uint8x16_t L1 = vld1q_u8(left + 16);
+    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
+    const uint16x8_t p1 = vpaddlq_u8(L1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_left = vcombine_u16(p5, p5);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 6);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 5);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 5);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 32; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+      vst1q_u8(dst + i * stride + 16, dc);
+    }
+  }
+}
+
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
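+// The D45 predictors smooth the above row with the 3-tap filter
+//   (a + 2 * b + c + 2) >> 2
+// Here that is built from a truncating halving add (vhadd) of the outer taps
+// followed by a rounding halving add (vrhadd) with the centre tap, which
+// produces the same result as the scalar expression.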
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
+  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
+  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
+  const uint8x8_t A0 = vld1_u8(above);  // top row
+  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
+  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
+  const uint8x8_t avg1 = vhadd_u8(A0, A2);
+  uint8x8_t row = vrhadd_u8(avg1, A1);
+  int i;
+  (void)left;
+  for (i = 0; i < 7; ++i) {
+    vst1_u8(dst + i * stride, row);
+    row = vtbl1_u8(row, sh_12345677);
+  }
+  vst1_u8(dst + i * stride, row);
+}
+
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t A0 = vld1q_u8(above);  // top row
+  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
+  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
+  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
+  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
+  uint8x16_t row = vrhaddq_u8(avg1, A1);
+  int i;
+  (void)left;
+  for (i = 0; i < 15; ++i) {
+    vst1q_u8(dst + i * stride, row);
+    row = vextq_u8(row, above_right, 1);
+  }
+  vst1q_u8(dst + i * stride, row);
+}
+
+// -----------------------------------------------------------------------------
+
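+// D135 concatenates the reversed left column, the top-left pixel and the
+// above row into the byte sequence L K J I X A B C (with D appended for the
+// last tap), applies the same (a + 2 * b + c + 2) >> 2 filter, and writes
+// each output row as a 4-byte window into the filtered sequence, offset by
+// one byte per row.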
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
+#if !HAVE_NEON_ASM
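+// The intrinsic versions below mirror the hand-written assembly in
+// intrapred_neon_asm.asm and are only compiled when that assembly is not
+// used (HAVE_NEON_ASM is 0).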
+
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint32x2_t d0u32 = vdup_n_u32(0);
+  (void)left;
+
+  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+  for (i = 0; i < 4; i++, dst += stride)
+    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  (void)left;
+
+  d0u8 = vld1_u8(above);
+  for (i = 0; i < 8; i++, dst += stride)
+    vst1_u8(dst, d0u8);
+}
+
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  for (i = 0; i < 16; i++, dst += stride)
+    vst1q_u8(dst, q0u8);
+}
+
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  q1u8 = vld1q_u8(above + 16);
+  for (i = 0; i < 32; i++, dst += stride) {
+    vst1q_u8(dst, q0u8);
+    vst1q_u8(dst + 16, q1u8);
+  }
+}
+
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d1u32 = vdup_n_u32(0);
+  (void)above;
+
+  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint64x1_t d1u64 = vdup_n_u64(0);
+  (void)above;
+
+  d1u64 = vld1_u64((const uint64_t *)left);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+  vst1_u8(dst, d0u8);
+}
+
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  q1u8 = vld1q_u8(left);
+  d2u8 = vget_low_u8(q1u8);
+  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+    q0u8 = vdupq_lane_u8(d2u8, 0);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 1);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 2);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 3);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 4);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 5);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 6);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 7);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+  }
+}
+
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  for (k = 0; k < 2; k++, left += 16) {
+    q1u8 = vld1q_u8(left);
+    d2u8 = vget_low_u8(q1u8);
+    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+      q0u8 = vdupq_lane_u8(d2u8, 0);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 1);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 2);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 3);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 4);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 5);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 6);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 7);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+    }
+  }
+}
+
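+// TM (TrueMotion) prediction: every output pixel is
+//   clip_pixel(left[row] + above[col] - above[-1])
+// The (above - top_left) difference is widened to 16 bits once per block,
+// each left pixel is broadcast across a row, and vqmovun_s16() provides the
+// final clamp to [0, 255].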
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint16x8_t q1u16, q3u16;
+  int16x8_t q1s16;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d2u32 = vdup_n_u32(0);
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+  for (i = 0; i < 4; i++, dst += stride) {
+    q1u16 = vdupq_n_u16((uint16_t)left[i]);
+    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+                      vreinterpretq_s16_u16(q3u16));
+    d0u8 = vqmovun_s16(q1s16);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  }
+}
+
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint16x8_t q0u16, q3u16, q10u16;
+  int16x8_t q0s16;
+  uint16x4_t d20u16;
+  uint8x8_t d0u8, d2u8, d30u8;
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d30u8 = vld1_u8(left);
+  d2u8 = vld1_u8(above);
+  q10u16 = vmovl_u8(d30u8);
+  q3u16 = vsubl_u8(d2u8, d0u8);
+  d20u16 = vget_low_u16(q10u16);
+  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+    q0u16 = vdupq_lane_u16(d20u16, 0);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 1);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 2);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 3);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+  }
+}
+
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+  uint8x16_t q0u8, q1u8;
+  int16x8_t q0s16, q1s16, q8s16, q11s16;
+  uint16x4_t d20u16;
+  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  for (k = 0; k < 2; k++, left += 8) {
+    d18u8 = vld1_u8(left);
+    q10u16 = vmovl_u8(d18u8);
+    d20u16 = vget_low_u16(q10u16);
+    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+      q0u16 = vdupq_lane_u16(d20u16, 0);
+      q8u16 = vdupq_lane_u16(d20u16, 1);
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d20u16, 2);
+      q8u16 = vdupq_lane_u16(d20u16, 3);
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+    }
+  }
+}
+
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+  uint8x16_t q0u8, q1u8, q2u8;
+  int16x8_t q12s16, q13s16, q14s16, q15s16;
+  uint16x4_t d6u16;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u8 = vld1q_u8(above + 16);
+  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+  for (k = 0; k < 4; k++, left += 8) {
+    d26u8 = vld1_u8(left);
+    q3u16 = vmovl_u8(d26u8);
+    d6u16 = vget_low_u16(q3u16);
+    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+      q0u16 = vdupq_lane_u16(d6u16, 0);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 1);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+    }
+  }
+}
+#endif  // !HAVE_NEON_ASM
diff --git a/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
similarity index 85%
rename from libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
rename to libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
index dc9856f..115790d 100644
--- a/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
@@ -8,25 +8,25 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_v_predictor_4x4_neon|
-    EXPORT  |vp9_v_predictor_8x8_neon|
-    EXPORT  |vp9_v_predictor_16x16_neon|
-    EXPORT  |vp9_v_predictor_32x32_neon|
-    EXPORT  |vp9_h_predictor_4x4_neon|
-    EXPORT  |vp9_h_predictor_8x8_neon|
-    EXPORT  |vp9_h_predictor_16x16_neon|
-    EXPORT  |vp9_h_predictor_32x32_neon|
-    EXPORT  |vp9_tm_predictor_4x4_neon|
-    EXPORT  |vp9_tm_predictor_8x8_neon|
-    EXPORT  |vp9_tm_predictor_16x16_neon|
-    EXPORT  |vp9_tm_predictor_32x32_neon|
+    EXPORT  |vpx_v_predictor_4x4_neon|
+    EXPORT  |vpx_v_predictor_8x8_neon|
+    EXPORT  |vpx_v_predictor_16x16_neon|
+    EXPORT  |vpx_v_predictor_32x32_neon|
+    EXPORT  |vpx_h_predictor_4x4_neon|
+    EXPORT  |vpx_h_predictor_8x8_neon|
+    EXPORT  |vpx_h_predictor_16x16_neon|
+    EXPORT  |vpx_h_predictor_32x32_neon|
+    EXPORT  |vpx_tm_predictor_4x4_neon|
+    EXPORT  |vpx_tm_predictor_8x8_neon|
+    EXPORT  |vpx_tm_predictor_16x16_neon|
+    EXPORT  |vpx_tm_predictor_32x32_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -34,16 +34,16 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_v_predictor_4x4_neon| PROC
+|vpx_v_predictor_4x4_neon| PROC
     vld1.32             {d0[0]}, [r2]
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_v_predictor_4x4_neon|
+    ENDP                ; |vpx_v_predictor_4x4_neon|
 
-;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -51,7 +51,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_v_predictor_8x8_neon| PROC
+|vpx_v_predictor_8x8_neon| PROC
     vld1.8              {d0}, [r2]
     vst1.8              {d0}, [r0], r1
     vst1.8              {d0}, [r0], r1
@@ -62,9 +62,9 @@
     vst1.8              {d0}, [r0], r1
     vst1.8              {d0}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_v_predictor_8x8_neon|
+    ENDP                ; |vpx_v_predictor_8x8_neon|
 
-;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -72,7 +72,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_v_predictor_16x16_neon| PROC
+|vpx_v_predictor_16x16_neon| PROC
     vld1.8              {q0}, [r2]
     vst1.8              {q0}, [r0], r1
     vst1.8              {q0}, [r0], r1
@@ -91,9 +91,9 @@
     vst1.8              {q0}, [r0], r1
     vst1.8              {q0}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_v_predictor_16x16_neon|
+    ENDP                ; |vpx_v_predictor_16x16_neon|
 
-;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -101,7 +101,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_v_predictor_32x32_neon| PROC
+|vpx_v_predictor_32x32_neon| PROC
     vld1.8              {q0, q1}, [r2]
     mov                 r2, #2
 loop_v
@@ -124,9 +124,9 @@
     subs                r2, r2, #1
     bgt                 loop_v
     bx                  lr
-    ENDP                ; |vp9_v_predictor_32x32_neon|
+    ENDP                ; |vpx_v_predictor_32x32_neon|
 
-;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -134,7 +134,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_h_predictor_4x4_neon| PROC
+|vpx_h_predictor_4x4_neon| PROC
     vld1.32             {d1[0]}, [r3]
     vdup.8              d0, d1[0]
     vst1.32             {d0[0]}, [r0], r1
@@ -145,9 +145,9 @@
     vdup.8              d0, d1[3]
     vst1.32             {d0[0]}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_h_predictor_4x4_neon|
+    ENDP                ; |vpx_h_predictor_4x4_neon|
 
-;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -155,7 +155,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_h_predictor_8x8_neon| PROC
+|vpx_h_predictor_8x8_neon| PROC
     vld1.64             {d1}, [r3]
     vdup.8              d0, d1[0]
     vst1.64             {d0}, [r0], r1
@@ -174,9 +174,9 @@
     vdup.8              d0, d1[7]
     vst1.64             {d0}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_h_predictor_8x8_neon|
+    ENDP                ; |vpx_h_predictor_8x8_neon|
 
-;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -184,7 +184,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_h_predictor_16x16_neon| PROC
+|vpx_h_predictor_16x16_neon| PROC
     vld1.8              {q1}, [r3]
     vdup.8              q0, d2[0]
     vst1.8              {q0}, [r0], r1
@@ -219,9 +219,9 @@
     vdup.8              q0, d3[7]
     vst1.8              {q0}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_h_predictor_16x16_neon|
+    ENDP                ; |vpx_h_predictor_16x16_neon|
 
-;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -229,7 +229,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_h_predictor_32x32_neon| PROC
+|vpx_h_predictor_32x32_neon| PROC
     sub                 r1, r1, #16
     mov                 r2, #2
 loop_h
@@ -285,9 +285,9 @@
     subs                r2, r2, #1
     bgt                 loop_h
     bx                  lr
-    ENDP                ; |vp9_h_predictor_32x32_neon|
+    ENDP                ; |vpx_h_predictor_32x32_neon|
 
-;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -295,11 +295,10 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_tm_predictor_4x4_neon| PROC
+|vpx_tm_predictor_4x4_neon| PROC
     ; Load ytop_left = above[-1];
     sub                 r12, r2, #1
-    ldrb                r12, [r12]
-    vdup.u8             d0, r12
+    vld1.u8             {d0[]}, [r12]
 
     ; Load above 4 pixels
     vld1.32             {d2[0]}, [r2]
@@ -309,10 +308,10 @@
 
     ; Load left row by row and compute left + (above - ytop_left)
     ; 1st row and 2nd row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]!
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
     vqmovun.s16         d0, q1
@@ -321,10 +320,10 @@
     vst1.32             {d1[0]}, [r0], r1
 
     ; 3rd row and 4th row
-    ldrb                r12, [r3], #1
-    ldrb                r2, [r3], #1
-    vdup.u16            q1, r12
-    vdup.u16            q2, r2
+    vld1.u8             {d2[]}, [r3]!
+    vld1.u8             {d4[]}, [r3]
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
     vqmovun.s16         d0, q1
@@ -332,9 +331,9 @@
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
     bx                  lr
-    ENDP                ; |vp9_tm_predictor_4x4_neon|
+    ENDP                ; |vpx_tm_predictor_4x4_neon|
 
-;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -342,11 +341,10 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_tm_predictor_8x8_neon| PROC
+|vpx_tm_predictor_8x8_neon| PROC
     ; Load ytop_left = above[-1];
     sub                 r12, r2, #1
-    ldrb                r12, [r12]
-    vdup.u8             d0, r12
+    vld1.8              {d0[]}, [r12]
 
     ; preload 8 left
     vld1.8              {d30}, [r3]
@@ -405,9 +403,9 @@
     vst1.64             {d3}, [r0], r1
 
     bx                  lr
-    ENDP                ; |vp9_tm_predictor_8x8_neon|
+    ENDP                ; |vpx_tm_predictor_8x8_neon|
 
-;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -415,11 +413,10 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_tm_predictor_16x16_neon| PROC
+|vpx_tm_predictor_16x16_neon| PROC
     ; Load ytop_left = above[-1];
     sub                 r12, r2, #1
-    ldrb                r12, [r12]
-    vdup.u8             q0, r12
+    vld1.8              {d0[]}, [r12]
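+    ; Note: this load only duplicates the top-left byte into d0 (the old
+    ; ldrb/vdup.u8 q0 sequence filled d0 and d1), so the vsubl.u8 below now
+    ; subtracts d0 from both halves of the above row.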
 
     ; Load above 16 pixels
     vld1.8              {q1}, [r2]
@@ -429,7 +426,7 @@
 
     ; Compute above - ytop_left
     vsubl.u8            q2, d2, d0
-    vsubl.u8            q3, d3, d1
+    vsubl.u8            q3, d3, d0
 
     vmovl.u8            q10, d18
 
@@ -499,9 +496,9 @@
     bgt                 loop_16x16_neon
 
     bx                  lr
-    ENDP                ; |vp9_tm_predictor_16x16_neon|
+    ENDP                ; |vpx_tm_predictor_16x16_neon|
 
-;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                  const uint8_t *above,
 ;                                  const uint8_t *left)
 ; r0  uint8_t *dst
@@ -509,11 +506,10 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
 
-|vp9_tm_predictor_32x32_neon| PROC
+|vpx_tm_predictor_32x32_neon| PROC
     ; Load ytop_left = above[-1];
     sub                 r12, r2, #1
-    ldrb                r12, [r12]
-    vdup.u8             q0, r12
+    vld1.8              {d0[]}, [r12]
 
     ; Load above 32 pixels
     vld1.8              {q1}, [r2]!
@@ -524,9 +520,9 @@
 
     ; Compute above - ytop_left
     vsubl.u8            q8, d2, d0
-    vsubl.u8            q9, d3, d1
+    vsubl.u8            q9, d3, d0
     vsubl.u8            q10, d4, d0
-    vsubl.u8            q11, d5, d1
+    vsubl.u8            q11, d5, d0
 
     vmovl.u8            q3, d26
 
@@ -629,6 +625,6 @@
     bgt                 loop_32x32_neon
 
     bx                  lr
-    ENDP                ; |vp9_tm_predictor_32x32_neon|
+    ENDP                ; |vpx_tm_predictor_32x32_neon|
 
     END
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
similarity index 94%
rename from libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
rename to libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
index 5b8ec20..5a8fdd6 100644
--- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
@@ -8,12 +8,12 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_dual_neon|
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 ;                                    const uint8_t *blimit0,
 ;                                    const uint8_t *limit0,
 ;                                    const uint8_t *thresh0,
@@ -29,7 +29,7 @@
 ; sp+8  const uint8_t *limit1,
 ; sp+12 const uint8_t *thresh1,
 
-|vp9_lpf_horizontal_4_dual_neon| PROC
+|vpx_lpf_horizontal_4_dual_neon| PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              ; load thresh0
@@ -66,7 +66,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon_16
+    bl          vpx_loop_filter_neon_16
 
     vst1.u8     {q5}, [r2@64], r1          ; store op1
     vst1.u8     {q6}, [r3@64], r1          ; store op0
@@ -76,9 +76,9 @@
     vpop        {d8-d15}                   ; restore neon registers
 
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_dual_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
 
-; void vp9_loop_filter_neon_16();
+; void vpx_loop_filter_neon_16();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. This function uses
 ; registers d8-d15, so the calling function must save those registers.
@@ -101,7 +101,7 @@
 ; q6    op0
 ; q7    oq0
 ; q8    oq1
-|vp9_loop_filter_neon_16| PROC
+|vpx_loop_filter_neon_16| PROC
 
     ; filter_mask
     vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
@@ -194,6 +194,6 @@
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon_16|
+    ENDP        ; |vpx_loop_filter_neon_16|
 
     END
diff --git a/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 0000000..d24e6ad
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,179 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
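+// 16-pixel-wide (dual) version of the 4-tap loop filter. In scalar terms,
+// per column:
+//   filter  = clamp(ps1 - qs1) & hev
+//   filter  = clamp(filter + 3 * (qs0 - ps0)) & mask
+//   Filter1 = clamp(filter + 4) >> 3,  Filter2 = clamp(filter + 3) >> 3
+//   oq0 = qs0 - Filter1, op0 = ps0 + Filter2
+//   op1/oq1 are adjusted by round(Filter1 / 2) only where hev is not set,
+// with all samples moved into the signed domain via the 0x80 XOR below.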
+static INLINE void loop_filter_neon_16(
+        uint8x16_t qblimit,  // blimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3 = vabdq_u8(q9, q8);
+    q4 = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3 = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vdupq_n_u16(3);
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q4 = vdupq_n_u8(3);
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8),  q10);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+    return;
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+    uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+    uint8x16_t qblimit, qlimit, qthresh;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+    dblimit0 = vld1_u8(blimit0);
+    dlimit0 = vld1_u8(limit0);
+    dthresh0 = vld1_u8(thresh0);
+    dblimit1 = vld1_u8(blimit1);
+    dlimit1 = vld1_u8(limit1);
+    dthresh1 = vld1_u8(thresh1);
+    qblimit = vcombine_u8(dblimit0, dblimit1);
+    qlimit = vcombine_u8(dlimit0, dlimit1);
+    qthresh = vcombine_u8(dthresh0, dthresh1);
+
+    s -= (p << 2);
+
+    q3u8 = vld1q_u8(s);
+    s += p;
+    q4u8 = vld1q_u8(s);
+    s += p;
+    q5u8 = vld1q_u8(s);
+    s += p;
+    q6u8 = vld1q_u8(s);
+    s += p;
+    q7u8 = vld1q_u8(s);
+    s += p;
+    q8u8 = vld1q_u8(s);
+    s += p;
+    q9u8 = vld1q_u8(s);
+    s += p;
+    q10u8 = vld1q_u8(s);
+
+    loop_filter_neon_16(qblimit, qlimit, qthresh,
+                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                        &q5u8, &q6u8, &q7u8, &q8u8);
+
+    s -= (p * 5);
+    vst1q_u8(s, q5u8);
+    s += p;
+    vst1q_u8(s, q6u8);
+    s += p;
+    vst1q_u8(s, q7u8);
+    s += p;
+    vst1q_u8(s, q8u8);
+    return;
+}
diff --git a/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 0000000..e45e34c
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,277 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+;                                int p /* pitch */,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh,
+;                                int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_horizontal_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #8]              ; load count
+    ldr         r2, [sp, #4]               ; load thresh
+    add         r1, r1, r1                 ; double pitch
+
+    cmp         r12, #0
+    beq         end_vpx_lf_h_edge
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
+    add         r3, r2, r1, lsr #1         ; set to 3 lines down
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          vpx_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    add         r0, r0, #8
+    subs        r12, r12, #1
+    bne         count_lf_h_loop
+
+end_vpx_lf_h_edge
+    pop         {pc}
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
+
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
+;                              int p /* pitch */,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh,
+;                              int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_vertical_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #8]             ; load count
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh
+    sub         r2, r0, #4                ; move s pointer down by 4 columns
+    cmp         r12, #0
+    beq         end_vpx_lf_v_edge
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+count_lf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load s data
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose to 8x8 matrix
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vpx_loop_filter_neon
+
+    sub         r0, r0, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    subne       r2, r0, #4                 ; move s pointer down by 4 columns
+    bne         count_lf_v_loop
+
+end_vpx_lf_v_edge
+    pop         {pc}
+    ENDP        ; |vpx_lpf_vertical_4_neon|
+
+; void vpx_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|vpx_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; abs(m1) > limit
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; a > blimit
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vpx_loop_filter_neon|
+
+    END
diff --git a/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 0000000..7ad411a
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(
+        uint8x8_t dblimit,    // flimit
+        uint8x8_t dlimit,     // limit
+        uint8x8_t dthresh,    // thresh
+        uint8x8_t d3u8,       // p3
+        uint8x8_t d4u8,       // p2
+        uint8x8_t d5u8,       // p1
+        uint8x8_t d6u8,       // p0
+        uint8x8_t d7u8,       // q0
+        uint8x8_t d16u8,      // q1
+        uint8x8_t d17u8,      // q2
+        uint8x8_t d18u8,      // q3
+        uint8x8_t *d4ru8,     // p1
+        uint8x8_t *d5ru8,     // p0
+        uint8x8_t *d6ru8,     // q0
+        uint8x8_t *d7ru8) {   // q1
+    uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+    int16x8_t q12s16;
+    int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
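+    // filter_mask: absolute differences between neighboring pixels, reduced
+    // with vmax below and compared against limit/blimit.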
+    d19u8 = vabd_u8(d3u8, d4u8);
+    d20u8 = vabd_u8(d4u8, d5u8);
+    d21u8 = vabd_u8(d5u8, d6u8);
+    d22u8 = vabd_u8(d16u8, d7u8);
+    d3u8  = vabd_u8(d17u8, d16u8);
+    d4u8  = vabd_u8(d18u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+    d20u8 = vmax_u8(d21u8, d22u8);
+    d3u8  = vmax_u8(d3u8,  d4u8);
+    d23u8 = vmax_u8(d19u8, d20u8);
+
+    d17u8 = vabd_u8(d6u8, d7u8);
+
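+    // hev mask: flag pixels where |p1 - p0| or |q1 - q0| exceeds thresh.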
+    d21u8 = vcgt_u8(d21u8, dthresh);
+    d22u8 = vcgt_u8(d22u8, dthresh);
+    d23u8 = vmax_u8(d23u8, d3u8);
+
+    d28u8 = vabd_u8(d5u8, d16u8);
+    d17u8 = vqadd_u8(d17u8, d17u8);
+
+    d23u8 = vcge_u8(dlimit, d23u8);
+
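+    // Convert p1, p0, q0, q1 to signed values by flipping the sign bit (x ^ 0x80).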
+    d18u8 = vdup_n_u8(0x80);
+    d5u8  = veor_u8(d5u8,  d18u8);
+    d6u8  = veor_u8(d6u8,  d18u8);
+    d7u8  = veor_u8(d7u8,  d18u8);
+    d16u8 = veor_u8(d16u8, d18u8);
+
+    d28u8 = vshr_n_u8(d28u8, 1);
+    d17u8 = vqadd_u8(d17u8, d28u8);
+
+    d19u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+                    vreinterpret_s8_u8(d6u8));
+
+    d17u8 = vcge_u8(dblimit, d17u8);
+
+    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+                     vreinterpret_s8_u8(d16u8));
+
+    d22u8 = vorr_u8(d21u8, d22u8);
+
+    q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+    d23u8 = vand_u8(d23u8, d17u8);
+
+    q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+    d17u8 = vdup_n_u8(4);
+
+    d27s8 = vqmovn_s16(q12s16);
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+    d27s8 = vreinterpret_s8_u8(d27u8);
+
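+    // filter2 = clamp(filter + 3) >> 3, filter1 = clamp(filter + 4) >> 3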
+    d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+    d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+    d28s8 = vshr_n_s8(d28s8, 3);
+    d27s8 = vshr_n_s8(d27s8, 3);
+
+    d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
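+    // Outer tap adjustment: filter = (filter1 + 1) >> 1, cleared where the hev mask is set.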
+    d27s8 = vrshr_n_s8(d27s8, 1);
+    d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+    d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+    d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+    *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+    *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+    *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+    *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+    return;
+}
+
+void vpx_lpf_horizontal_4_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    int i;
+    uint8_t *s, *psrc;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+    if (count == 0)  // end_vpx_lf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
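+    // Start four rows above the edge so the eight rows p3..q3 can be loaded.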
+    psrc = src - (pitch << 2);
+    for (i = 0; i < count; i++) {
+        s = psrc + i * 8;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
+
+        s -= (pitch * 5);
+        vst1_u8(s, d4u8);
+        s += pitch;
+        vst1_u8(s, d5u8);
+        s += pitch;
+        vst1_u8(s, d6u8);
+        s += pitch;
+        vst1_u8(s, d7u8);
+    }
+    return;
+}
+
+void vpx_lpf_vertical_4_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    int i, pitch8;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+
+    if (count == 0)  // end_vpx_lf_v_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    pitch8 = pitch * 8;
+    for (i = 0; i < count; i++, src += pitch8) {
+        s = src - (i + 1) * 4;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
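+        // Transpose the 8x8 byte block so each d-register holds one of p3..q3.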
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                          vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                          vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                          vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                          vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+
+        d3u8 = d2tmp8.val[0];
+        d4u8 = d2tmp8.val[1];
+        d5u8 = d2tmp9.val[0];
+        d6u8 = d2tmp9.val[1];
+        d7u8 = d2tmp10.val[0];
+        d16u8 = d2tmp10.val[1];
+        d17u8 = d2tmp11.val[0];
+        d18u8 = d2tmp11.val[1];
+
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
+
+        d4Result.val[0] = d4u8;
+        d4Result.val[1] = d5u8;
+        d4Result.val[2] = d6u8;
+        d4Result.val[3] = d7u8;
+
+        src -= 2;
+        vst4_lane_u8(src, d4Result, 0);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 1);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 2);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 3);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 4);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 5);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 6);
+        src += pitch;
+        vst4_lane_u8(src, d4Result, 7);
+    }
+    return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
similarity index 62%
rename from libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
rename to libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
index 4430322..e81734c 100644
--- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -8,275 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_neon|
-    EXPORT  |vp9_lpf_vertical_4_neon|
-    EXPORT  |vp9_lpf_horizontal_8_neon|
-    EXPORT  |vp9_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-;                                int p /* pitch */,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_lpf_horizontal_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #8]              ; load count
-    ldr         r2, [sp, #4]               ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    cmp         r12, #0
-    beq         end_vp9_lf_h_edge
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-count_lf_h_loop
-    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r3, r2, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r2@64], r1          ; p3
-    vld1.u8     {d4}, [r3@64], r1          ; p2
-    vld1.u8     {d5}, [r2@64], r1          ; p1
-    vld1.u8     {d6}, [r3@64], r1          ; p0
-    vld1.u8     {d7}, [r2@64], r1          ; q0
-    vld1.u8     {d16}, [r3@64], r1         ; q1
-    vld1.u8     {d17}, [r2@64]             ; q2
-    vld1.u8     {d18}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          vp9_loop_filter_neon
-
-    vst1.u8     {d4}, [r2@64], r1          ; store op1
-    vst1.u8     {d5}, [r3@64], r1          ; store op0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq0
-    vst1.u8     {d7}, [r3@64], r1          ; store oq1
-
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_lf_h_loop
-
-end_vp9_lf_h_edge
-    pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-;                              int p /* pitch */,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; sp+4  int count
-|vp9_lpf_vertical_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #8]             ; load count
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #4]              ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vp9_lf_v_edge
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-count_lf_v_loop
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    bl          vp9_loop_filter_neon
-
-    sub         r0, r0, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
-    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
-    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
-    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
-    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
-    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
-    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
-    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_lf_v_loop
-
-end_vp9_lf_v_edge
-    pop         {pc}
-    ENDP        ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d4    op1
-; d5    op0
-; d6    oq0
-; d7    oq1
-|vp9_loop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
-
-    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
-
-    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
-
-    vmov.u8     d18, #0x80
-
-    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
-
-    ; hevmask
-    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
-
-    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
-    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
-
-    veor        d7, d7, d18                 ; qs0
-
-    vcge.u8     d23, d1, d23                ; abs(m1) > limit
-
-    ; filter() function
-    ; convert to signed
-
-    vshr.u8     d28, d28, #1                ; a = a / 2
-    veor        d6, d6, d18                 ; ps0
-
-    veor        d5, d5, d18                 ; ps1
-    vqadd.u8    d17, d17, d28               ; a = b + a
-
-    veor        d16, d16, d18               ; qs1
-
-    vmov.u8     d19, #3
-
-    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
-
-    vcge.u8     d17, d0, d17                ; a > blimit
-
-    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
-    vorr        d22, d21, d22               ; hevmask
-
-    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
-
-    vand        d27, d27, d22               ; filter &= hev
-    vand        d23, d23, d17               ; filter_mask
-
-    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d17, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d27, q12
-
-    vand        d27, d27, d23               ; filter &= mask
-
-    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
-    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
-    vshr.s8     d28, d28, #3                ; filter2 >>= 3
-    vshr.s8     d27, d27, #3                ; filter1 >>= 3
-
-    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
-    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
-
-    veor        d6, d26, d18                ; *oq0 = u^0x80
-
-    vbic        d27, d27, d22               ; filter &= ~hev
-
-    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
-    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
-
-    veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d4, d21, d18                ; *op1 = u^0x80
-    veor        d7, d20, d18                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
-
-; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh,
@@ -287,7 +30,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_8_neon| PROC
+|vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -296,7 +39,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_mblf_h_edge
+    beq         end_vpx_mblf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -317,7 +60,7 @@
     sub         r3, r3, r1, lsl #1
     sub         r2, r2, r1, lsl #2
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     vst1.u8     {d0}, [r2@64], r1          ; store op2
     vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -330,12 +73,12 @@
     subs        r12, r12, #1
     bne         count_mblf_h_loop
 
-end_vp9_mblf_h_edge
+end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_8_neon|
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
 
-; void vp9_lpf_vertical_8_neon(uint8_t *s,
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -348,7 +91,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_8_neon| PROC
+|vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -358,7 +101,7 @@
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_mblf_v_edge
+    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -391,7 +134,7 @@
     sub         r2, r0, #3
     add         r3, r0, #1
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     ;store op2, op1, op0, oq0
     vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -418,11 +161,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_mblf_v_loop
 
-end_vp9_mblf_v_edge
+end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
-    ENDP        ; |vp9_lpf_vertical_8_neon|
+    ENDP        ; |vpx_lpf_vertical_8_neon|
 
-; void vp9_mbloop_filter_neon();
+; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -448,7 +191,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vp9_mbloop_filter_neon| PROC
+|vpx_mbloop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -703,6 +446,6 @@
 
     bx          lr
 
-    ENDP        ; |vp9_mbloop_filter_neon|
+    ENDP        ; |vpx_mbloop_filter_neon|
 
     END
diff --git a/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 0000000..a887e2e
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(
+        uint8x8_t dblimit,   // mblimit
+        uint8x8_t dlimit,    // limit
+        uint8x8_t dthresh,   // thresh
+        uint8x8_t d3u8,      // p3
+        uint8x8_t d4u8,      // p2
+        uint8x8_t d5u8,      // p1
+        uint8x8_t d6u8,      // p0
+        uint8x8_t d7u8,      // q0
+        uint8x8_t d16u8,     // q1
+        uint8x8_t d17u8,     // q2
+        uint8x8_t d18u8,     // q3
+        uint8x8_t *d0ru8,    // p2
+        uint8x8_t *d1ru8,    // p1
+        uint8x8_t *d2ru8,    // p0
+        uint8x8_t *d3ru8,    // q0
+        uint8x8_t *d4ru8,    // q1
+        uint8x8_t *d5ru8) {  // q2
+    uint32_t flat;
+    uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+    uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int16x8_t q15s16;
+    uint16x8_t q10u16, q14u16;
+    int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
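+    // Filter mask and flat mask: absolute differences of neighboring pixels,
+    // reduced with vmax and compared against limit, blimit, and 1 (flat).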
+    d19u8 = vabd_u8(d3u8, d4u8);
+    d20u8 = vabd_u8(d4u8, d5u8);
+    d21u8 = vabd_u8(d5u8, d6u8);
+    d22u8 = vabd_u8(d16u8, d7u8);
+    d23u8 = vabd_u8(d17u8, d16u8);
+    d24u8 = vabd_u8(d18u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+    d20u8 = vmax_u8(d21u8, d22u8);
+
+    d25u8 = vabd_u8(d6u8, d4u8);
+
+    d23u8 = vmax_u8(d23u8, d24u8);
+
+    d26u8 = vabd_u8(d7u8, d17u8);
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+
+    d24u8 = vabd_u8(d6u8, d7u8);
+    d27u8 = vabd_u8(d3u8, d6u8);
+    d28u8 = vabd_u8(d18u8, d7u8);
+
+    d19u8 = vmax_u8(d19u8, d23u8);
+
+    d23u8 = vabd_u8(d5u8, d16u8);
+    d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+    d19u8 = vcge_u8(dlimit, d19u8);
+
+
+    d25u8 = vmax_u8(d25u8, d26u8);
+    d26u8 = vmax_u8(d27u8, d28u8);
+
+    d23u8 = vshr_n_u8(d23u8, 1);
+
+    d25u8 = vmax_u8(d25u8, d26u8);
+
+    d24u8 = vqadd_u8(d24u8, d23u8);
+
+    d20u8 = vmax_u8(d20u8, d25u8);
+
+    d23u8 = vdup_n_u8(1);
+    d24u8 = vcge_u8(dblimit, d24u8);
+
+    d21u8 = vcgt_u8(d21u8, dthresh);
+
+    d20u8 = vcge_u8(d23u8, d20u8);
+
+    d19u8 = vand_u8(d19u8, d24u8);
+
+    d23u8 = vcgt_u8(d22u8, dthresh);
+
+    d20u8 = vand_u8(d20u8, d19u8);
+
+    d22u8 = vdup_n_u8(0x80);
+
+    d23u8 = vorr_u8(d21u8, d23u8);
+
+    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+                          vreinterpret_u16_u8(d21u8));
+
+    d30u8 = vshrn_n_u16(q10u16, 4);
+    flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+    if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+        d27u8 = vdup_n_u8(3);
+        d21u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+        q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+        *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vsubw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vsubw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+        *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+    } else {
+        d21u8 = veor_u8(d7u8,  d22u8);
+        d24u8 = veor_u8(d6u8,  d22u8);
+        d25u8 = veor_u8(d5u8,  d22u8);
+        d26u8 = veor_u8(d16u8, d22u8);
+
+        d27u8 = vdup_n_u8(3);
+
+        d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+        d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+        q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+        d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+        q15s16 = vaddw_s8(q15s16, d29s8);
+
+        d29u8 = vdup_n_u8(4);
+
+        d28s8 = vqmovn_s16(q15s16);
+
+        d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+        d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+        d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+        d30s8 = vshr_n_s8(d30s8, 3);
+        d29s8 = vshr_n_s8(d29s8, 3);
+
+        d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+        d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+        d29s8 = vrshr_n_s8(d29s8, 1);
+        d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+        d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+        d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+        if (flat == 0) {  // filter_branch_only
+            *d0ru8 = d4u8;
+            *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+            *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+            *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+            *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+            *d5ru8 = d17u8;
+            return;
+        }
+
+        d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+        d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+        d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+        d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
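+        // flat and filter mask both set for some pixels: compute the wider flat
+        // filter and blend it with the 4-tap result per pixel via vbsl on d20u8.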
+        d23u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+        q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+        d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+        q14u16 = vaddw_u8(q14u16, d5u8);
+
+        d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+        d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vaddw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+
+        d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+        d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vaddw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+
+        *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+        d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d3u8);
+        q14u16 = vsubw_u8(q14u16, d6u8);
+        q14u16 = vaddw_u8(q14u16, d7u8);
+
+        *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+        d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d4u8);
+        q14u16 = vsubw_u8(q14u16, d7u8);
+        q14u16 = vaddw_u8(q14u16, d16u8);
+
+        d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+        d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+        q14u16 = vsubw_u8(q14u16, d5u8);
+        q14u16 = vsubw_u8(q14u16, d16u8);
+        q14u16 = vaddw_u8(q14u16, d17u8);
+        q14u16 = vaddw_u8(q14u16, d18u8);
+
+        d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+        d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+        *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+        *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+        *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+    }
+    return;
+}
+
+void vpx_lpf_horizontal_8_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    int i;
+    uint8_t *s, *psrc;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+
+    if (count == 0)  // end_vpx_mblf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    psrc = src - (pitch << 2);
+    for (i = 0; i < count; i++) {
+        s = psrc + i * 8;
+
+        d3u8  = vld1_u8(s);
+        s += pitch;
+        d4u8  = vld1_u8(s);
+        s += pitch;
+        d5u8  = vld1_u8(s);
+        s += pitch;
+        d6u8  = vld1_u8(s);
+        s += pitch;
+        d7u8  = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+        s -= (pitch * 6);
+        vst1_u8(s, d0u8);
+        s += pitch;
+        vst1_u8(s, d1u8);
+        s += pitch;
+        vst1_u8(s, d2u8);
+        s += pitch;
+        vst1_u8(s, d3u8);
+        s += pitch;
+        vst1_u8(s, d4u8);
+        s += pitch;
+        vst1_u8(s, d5u8);
+    }
+    return;
+}
+
+void vpx_lpf_vertical_8_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    int i;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+    uint8x8x2_t d2Result;
+
+    if (count == 0)
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    for (i = 0; i < count; i++) {
+        s = src + (i * (pitch << 3)) - 4;
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+
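+        // Transpose the 8x8 byte block so each d-register holds one of p3..q3.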
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                          vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                          vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                          vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                          vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+
+        d3u8 = d2tmp8.val[0];
+        d4u8 = d2tmp8.val[1];
+        d5u8 = d2tmp9.val[0];
+        d6u8 = d2tmp9.val[1];
+        d7u8 = d2tmp10.val[0];
+        d16u8 = d2tmp10.val[1];
+        d17u8 = d2tmp11.val[0];
+        d18u8 = d2tmp11.val[1];
+
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+        d4Result.val[0] = d0u8;
+        d4Result.val[1] = d1u8;
+        d4Result.val[2] = d2u8;
+        d4Result.val[3] = d3u8;
+
+        d2Result.val[0] = d4u8;
+        d2Result.val[1] = d5u8;
+
+        s = src - 3;
+        vst4_lane_u8(s, d4Result, 0);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 1);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 2);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 3);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 4);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 5);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 6);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 7);
+
+        s = src + 1;
+        vst2_lane_u8(s, d2Result, 0);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 1);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 2);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 3);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 4);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 5);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 6);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 7);
+    }
+    return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
similarity index 96%
rename from libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
rename to libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
index 5fe2bba..20d9cfb 100644
--- a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -8,13 +8,13 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_16_neon|
-    EXPORT  |vp9_lpf_vertical_16_neon|
+    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
 ;                                 const uint8_t *blimit,
 ;                                 const uint8_t *limit,
 ;                                 const uint8_t *thresh
@@ -24,7 +24,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_horizontal_16_neon| PROC
+|vpx_lpf_horizontal_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -54,7 +54,7 @@
     vld1.u8     {d14}, [r8@64], r1         ; q6
     vld1.u8     {d15}, [r8@64], r1         ; q7
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         h_mbfilter
@@ -115,9 +115,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_16_neon|
+    ENDP        ; |vpx_lpf_horizontal_16_neon|
 
-; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
 ;                               const uint8_t *limit,
 ;                               const uint8_t *thresh)
@@ -126,7 +126,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_vertical_16_neon| PROC
+|vpx_lpf_vertical_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -176,7 +176,7 @@
     vtrn.8      d12, d13
     vtrn.8      d14, d15
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         v_mbfilter
@@ -279,9 +279,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_vertical_16_neon|
+    ENDP        ; |vpx_lpf_vertical_16_neon|
 
-; void vp9_wide_mbfilter_neon();
+; void vpx_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -305,7 +305,7 @@
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vp9_wide_mbfilter_neon| PROC
+|vpx_wide_mbfilter_neon| PROC
     mov         r7, #0
 
     ; filter_mask
@@ -601,6 +601,6 @@
     vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
 
     bx          lr
-    ENDP        ; |vp9_wide_mbfilter_neon|
+    ENDP        ; |vpx_wide_mbfilter_neon|
 
     END
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/libvpx/vpx_dsp/arm/loopfilter_neon.c
similarity index 63%
rename from libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
rename to libvpx/vpx_dsp/arm/loopfilter_neon.c
index bc6a17c..eff87d2 100644
--- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,46 +8,51 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+                                  const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+#if HAVE_NEON_ASM
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
-                                  const uint8_t *blimit0,
-                                  const uint8_t *limit0,
-                                  const uint8_t *thresh0,
-                                  const uint8_t *blimit1,
-                                  const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
 }
+#endif  // HAVE_NEON_ASM
diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 0000000..c7704dc
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
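+// Horizontally add the two 16-bit accumulators and return the 32-bit sum of
+// all lanes.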
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+                                        vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+                                        vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_32,
+                        const uint8x16_t vec_src_48,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+                             vget_low_u8(vec_ref_32));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+                             vget_high_u8(vec_ref_32));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+                             vget_low_u8(vec_ref_48));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+                             vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 64; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 32; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+    sad_neon_32(vec_src_00, vec_src_16, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          uint32_t *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 16; ++i) {
+    const uint8x16_t vec_src = vld1q_u8(src);
+    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref0));
+    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref0));
+    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref1));
+    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref1));
+    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref2));
+    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref2));
+    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref3));
+    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref3));
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
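
Each of the x4d kernels above compares one source block against four reference candidates in a single pass over the source rows. A scalar reference of what they compute (hypothetical helper, parameterized by block size, not part of the patch):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the NxNx4d family: res[k] is the sum of absolute
 * differences between the NxN source block and reference candidate k. */
static void sad_nxnx4d_c(const uint8_t *src, int src_stride,
                         const uint8_t *const ref[4], int ref_stride,
                         int n, uint32_t *res) {
  int k, row, col;
  for (k = 0; k < 4; ++k) {
    const uint8_t *s = src;
    const uint8_t *r = ref[k];
    uint32_t sad = 0;
    for (row = 0; row < n; ++row) {
      for (col = 0; col < n; ++col)
        sad += (uint32_t)abs((int)s[col] - (int)r[col]);
      s += src_stride;
      r += ref_stride;
    }
    res[k] = sad;
  }
}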
diff --git a/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/libvpx/vpx_dsp/arm/sad_media.asm
similarity index 96%
rename from libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
rename to libvpx/vpx_dsp/arm/sad_media.asm
index 1b4f5cf..aed1d3a 100644
--- a/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/libvpx/vpx_dsp/arm/sad_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_sad16x16_armv6|
+    EXPORT  |vpx_sad16x16_media|
 
     ARM
     REQUIRE8
@@ -21,8 +21,7 @@
 ; r1    int  src_stride
 ; r2    const unsigned char *ref_ptr
 ; r3    int  ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
+|vpx_sad16x16_media| PROC
     stmfd   sp!, {r4-r12, lr}
 
     pld     [r0, r1, lsl #0]
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c
similarity index 65%
rename from libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c
rename to libvpx/vpx_dsp/arm/sad_neon.c
index c4cd856..173f08a 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/libvpx/vpx_dsp/arm/sad_neon.c
@@ -9,11 +9,113 @@
  */
 
 #include <arm_neon.h>
-#include "./vp9_rtcd.h"
+
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
 
+unsigned int vpx_sad8x16_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);
+
+    for (i = 0; i < 15; i++) {
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);
+    }
+
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
+
+unsigned int vpx_sad4x4_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x2_t d1;
+    uint64x1_t d3;
+    int i;
+
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);
+
+    for (i = 0; i < 3; i++) {
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);
+    }
+
+    d1 = vpaddl_u16(vget_low_u16(q12));
+    d3 = vpaddl_u32(d1);
+
+    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vpx_sad16x8_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x16_t q0, q4;
+    uint16x8_t q12, q13;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    q0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+    for (i = 0; i < 7; i++) {
+        q0 = vld1q_u8(src_ptr);
+        src_ptr += src_stride;
+        q4 = vld1q_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+    }
+
+    q12 = vaddq_u16(q12, q13);
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
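
vpx_sad8x16_neon, vpx_sad4x4_neon and vpx_sad16x8_neon all follow the same pattern: vabdl_u8 starts the accumulator with the first row's absolute differences, vabal_u8 accumulates the remaining rows, and a pairwise reduction produces the final sum. A scalar reference of what they compute (hypothetical helper, any width/height):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for a single-reference w x h SAD, covering the 8x16,
 * 4x4 and 16x8 kernels above (illustrative helper, not part of the patch). */
static unsigned int sad_wxh_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int w, int h) {
  unsigned int sad = 0;
  int row, col;
  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col)
      sad += (unsigned int)abs((int)src[col] - (int)ref[col]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}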
+
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                     const uint16x8_t vec_hi) {
   const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
@@ -34,7 +136,7 @@
   return vget_lane_u32(c, 0);
 }
 
-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -70,7 +172,7 @@
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
 
-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -95,7 +197,7 @@
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -114,7 +216,7 @@
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum = vdupq_n_u16(0);
diff --git a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm b/libvpx/vpx_dsp/arm/save_reg_neon.asm
similarity index 88%
rename from libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
rename to libvpx/vpx_dsp/arm/save_reg_neon.asm
index 71c3e70..c9ca108 100644
--- a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm
+++ b/libvpx/vpx_dsp/arm/save_reg_neon.asm
@@ -9,8 +9,8 @@
 ;
 
 
-    EXPORT  |vp9_push_neon|
-    EXPORT  |vp9_pop_neon|
+    EXPORT  |vpx_push_neon|
+    EXPORT  |vpx_pop_neon|
 
     ARM
     REQUIRE8
@@ -18,14 +18,14 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-|vp9_push_neon| PROC
+|vpx_push_neon| PROC
     vst1.i64            {d8, d9, d10, d11}, [r0]!
     vst1.i64            {d12, d13, d14, d15}, [r0]!
     bx              lr
 
     ENDP
 
-|vp9_pop_neon| PROC
+|vpx_pop_neon| PROC
     vld1.i64            {d8, d9, d10, d11}, [r0]!
     vld1.i64            {d12, d13, d14, d15}, [r0]!
     bx              lr
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_media.c b/libvpx/vpx_dsp/arm/subpel_variance_media.c
new file mode 100644
index 0000000..e7d8c85
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = {
+  { 128,   0 },
+  { 112,  16 },
+  {  96,  32 },
+  {  80,  48 },
+  {  64,  64 },
+  {  48,  80 },
+  {  32,  96 },
+  {  16, 112 }
+};
+
+extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
+                                                    uint16_t *dst_ptr,
+                                                    uint32_t src_pitch,
+                                                    uint32_t height,
+                                                    uint32_t width,
+                                                    const int16_t *filter);
+
+extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
+                                                     uint8_t *dst_ptr,
+                                                     int32_t src_pitch,
+                                                     uint32_t height,
+                                                     uint32_t width,
+                                                     const int16_t *filter);
+
+
+unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  uint16_t first_pass[10*8];
+  uint8_t  second_pass[8*8];
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = bilinear_filters_media[xoffset];
+  VFilter = bilinear_filters_media[yoffset];
+
+  vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line,
+                                          9, 8, HFilter);
+  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                           8, 8, 8, VFilter);
+
+  return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+                               dst_pixels_per_line, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
+                                               int src_pixels_per_line,
+                                               int xoffset,
+                                               int yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse) {
+  uint16_t first_pass[36*16];
+  uint8_t  second_pass[20*16];
+  const int16_t *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == 4 && yoffset == 0) {
+    var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 0 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 4 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line,
+                                                sse);
+  } else {
+    HFilter = bilinear_filters_media[xoffset];
+    VFilter = bilinear_filters_media[yoffset];
+
+    vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+                                  dst_pixels_per_line, sse);
+  }
+  return var;
+}
+#endif  // HAVE_MEDIA
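
The sub-pixel variance wrappers above run a horizontal bilinear pass into a 16-bit intermediate buffer, a vertical bilinear pass back to 8 bits, and then the plain variance kernel. The filter passes themselves live in armv6 assembly; the sketch below is only an assumption about their data flow, using 7-bit weights that sum to 128 and hypothetical helper names:

#include <stdint.h>

/* Assumed first pass: horizontal two-tap filter, 8-bit in, 16-bit out,
 * with 7-bit weights and round-to-nearest. */
static void bil_first_pass_c(const uint8_t *src, uint16_t *dst,
                             unsigned int src_pitch, unsigned int height,
                             unsigned int width, const int16_t *filter) {
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      dst[j] = (uint16_t)((src[j] * filter[0] + src[j + 1] * filter[1] + 64) >> 7);
    src += src_pitch;
    dst += width;
  }
}

/* Assumed second pass: vertical two-tap filter over the 16-bit
 * intermediate rows, narrowing back to 8 bits. */
static void bil_second_pass_c(const uint16_t *src, uint8_t *dst,
                              int32_t src_pitch, unsigned int height,
                              unsigned int width, const int16_t *filter) {
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      dst[j] = (uint8_t)((src[j] * filter[0] + src[j + src_pitch] * filter[1] + 64) >> 7);
    src += src_pitch;
    dst += width;
  }
}

The first pass produces one extra row (9, 17, ...) because the vertical pass reads the row below each output row.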
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000..40e2cc8
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+  { 128,   0, },
+  { 112,  16, },
+  {  96,  32, },
+  {  80,  48, },
+  {  64,  64, },
+  {  48,  80, },
+  {  32,  96, },
+  {  16, 112, },
+};
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      unsigned int output_width,
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; ++i) {
+    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(&output_ptr[0], out);
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,
+                                       int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 16) {
+      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+    }
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
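
The w8/w16 helpers narrow with vrshrn_n_u16, i.e. a rounding right shift by FILTER_BITS. Assuming FILTER_BITS is 7 (the weights in the table above sum to 128), one output sample reduces to this scalar sketch:

#include <stdint.h>

/* One bilinear output sample as produced by the w8/w16 helpers above,
 * assuming FILTER_BITS == 7 so the two weights sum to 128. */
static uint8_t bil_sample_c(uint8_t s0, uint8_t s1, uint8_t f0, uint8_t f1) {
  const unsigned int acc = (unsigned int)s0 * f0 + (unsigned int)s1 * f1;
  return (uint8_t)((acc + 64) >> 7);  /* rounding narrow, like vrshrn_n_u16 */
}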
+
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
+                                            int src_stride,
+                                            int xoffset,
+                                            int yoffset,
+                                            const uint8_t *dst,
+                                            int dst_stride,
+                                            unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+                            9, 8,
+                            bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+                            8, bilinear_filters[yoffset]);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+                             17, 16,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+                             16, bilinear_filters[yoffset]);
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+                             33, 32,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+                             32, bilinear_filters[yoffset]);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+                             65, 64,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+                             64, bilinear_filters[yoffset]);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c b/libvpx/vpx_dsp/arm/subtract_neon.c
similarity index 97%
rename from libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c
rename to libvpx/vpx_dsp/arm/subtract_neon.c
index b4bf567..7b14609 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c
+++ b/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -9,12 +9,11 @@
  */
 
 #include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
 
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_subtract_block_neon(int rows, int cols,
+void vpx_subtract_block_neon(int rows, int cols,
                              int16_t *diff, ptrdiff_t diff_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              const uint8_t *pred, ptrdiff_t pred_stride) {
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
similarity index 97%
rename from libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
rename to libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
index 3668dc5..dab845a 100644
--- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_h_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_h_armv6| PROC
+|vpx_variance_halfpixvar16x16_h_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
similarity index 98%
rename from libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
rename to libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
index b4e0959..01953b7 100644
--- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+|vpx_variance_halfpixvar16x16_hv_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
similarity index 97%
rename from libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
rename to libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
index 10863e2..0d17acb 100644
--- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_v_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_v_armv6| PROC
+|vpx_variance_halfpixvar16x16_v_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/libvpx/vpx_dsp/arm/variance_media.asm b/libvpx/vpx_dsp/arm/variance_media.asm
new file mode 100644
index 0000000..f7f9e14
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/variance_media.asm
@@ -0,0 +1,358 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance16x16_media|
+    EXPORT  |vpx_variance8x8_media|
+    EXPORT  |vpx_mse16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance16x16_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop16x16
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop16x16
+
+    ; store sse and return the variance
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance8x8_media| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop8x8
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop8x8
+
+    ; store sse and return the variance
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vpx_variance16x16_media. In this function the sum is never
+;      used, so that part of the calculation has been removed.
+
+|vpx_mse16x16_media| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse
+
+    ; store sse and return it
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
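
Throughout this armv6 (media) code, the usub8/sel pair is used twice per word to obtain per-byte absolute differences without branches, usad8 sums them into the running sum, and uxtb16/smlad square-accumulates them into sse. A scalar sketch of what one 4-pixel step contributes (hypothetical helper; the end result is the same since |d| squared equals d squared):

#include <stdint.h>

/* Scalar equivalent of one usub8/sel/usad8/smlad step in the assembly:
 * accumulate sum and sse for the four packed pixels of one 32-bit word. */
static void variance_word_c(uint32_t src_word, uint32_t ref_word,
                            int *sum, uint32_t *sse) {
  int byte;
  for (byte = 0; byte < 4; ++byte) {
    const int s = (int)((src_word >> (8 * byte)) & 0xff);
    const int r = (int)((ref_word >> (8 * byte)) & 0xff);
    const int diff = s - r;
    *sum += diff;
    *sse += (uint32_t)(diff * diff);
  }
}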
diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 0000000..ede6e7b
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,418 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, uint32_t *sse, int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo = vmlal_s16(v_sse_lo,
+                           vget_low_s16(sv_diff),
+                           vget_low_s16(sv_diff));
+      v_sse_hi = vmlal_s16(v_sse_hi,
+                           vget_high_s16(sv_diff),
+                           vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
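
variance_neon_w8 accumulates the signed difference sum and the sum of squared differences in one pass; the callers then form variance as sse minus sum*sum/(w*h), which the fixed shifts in their comments implement. A scalar sketch (hypothetical helper):

#include <stdint.h>

/* Scalar sketch of variance_neon_w8: one pass accumulating the signed
 * difference sum and the sum of squared differences over a w x h block. */
static void variance_wxh_c(const uint8_t *a, int a_stride,
                           const uint8_t *b, int b_stride,
                           int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += (uint32_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
}

The 32x64, 64x32 and 64x64 wrappers below call the helper once per strip of at most 2048 pixels (the overflow bound noted in the comment above) and add the partial sse/sum values before applying the final shift.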
+
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
+                        const uint8_t *b, int b_stride,
+                        unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
+                          const uint8_t *b, int b_stride,
+                          unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
+}
+
+unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
+}
+
+unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
+}
+
+unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+  variance_neon_w8(a + (32 * a_stride), a_stride,
+                   b + (32 * b_stride), b_stride, 32, 32,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
+}
+
+unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride,
+                   b + (16 * b_stride), b_stride, 64, 16,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 64 * 32
+}
+
+unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride,
+                   b + (16 * b_stride), b_stride, 64, 16,
+                   &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
+                   b + (16 * 2 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
+                   b + (16 * 3 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
+}
+
+unsigned int vpx_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_variance8x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d2u8, d4u8, d6u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d2u8, d6u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_mse16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    int64x1_t d0s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    q7s32 = vdupq_n_s32(0);
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q10s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q10s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
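
vpx_mse16x16_neon accumulates only the squared differences; unlike the variance kernels there is no sum correction term. A scalar sketch (hypothetical helper):

#include <stdint.h>

/* Scalar sketch of vpx_mse16x16_neon: sum of squared differences over a
 * 16x16 block; no (sum * sum) correction term, unlike the variance kernels. */
static unsigned int mse16x16_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  unsigned int total = 0;
  int row, col;
  for (row = 0; row < 16; ++row) {
    for (col = 0; col < 16; ++col) {
      const int diff = src[col] - ref[col];
      total += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = total;
  return total;
}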
+
+unsigned int vpx_get4x4sse_cs_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride) {
+    int16x4_t d22s16, d24s16, d26s16, d28s16;
+    int64x1_t d0s64;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d1u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d5u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d3u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d7u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+
+    q11u16 = vsubl_u8(d0u8, d4u8);
+    q12u16 = vsubl_u8(d1u8, d5u8);
+    q13u16 = vsubl_u8(d2u8, d6u8);
+    q14u16 = vsubl_u8(d3u8, d7u8);
+
+    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+    q7s32 = vmull_s16(d22s16, d22s16);
+    q8s32 = vmull_s16(d24s16, d24s16);
+    q9s32 = vmull_s16(d26s16, d26s16);
+    q10s32 = vmull_s16(d28s16, d28s16);
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q9s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q9s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
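Note on the kernels above: vpx_mse16x16_neon widens the 8-bit source/reference differences to 16 bits, squares and accumulates them with vmlal_s16, then reduces the four 32-bit lanes to one sum, which it both stores to *sse and returns. A minimal scalar sketch of the same computation, using an illustrative helper name that is not part of libvpx:

  static unsigned int mse16x16_ref(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
    unsigned int sum = 0;
    int r, c;
    for (r = 0; r < 16; ++r) {
      for (c = 0; c < 16; ++c) {
        const int diff = src[c] - ref[c];    /* vsubl_u8 viewed as s16 */
        sum += (unsigned int)(diff * diff);  /* vmlal_s16 accumulation */
      }
      src += src_stride;
      ref += ref_stride;
    }
    *sse = sum;
    return sum;  /* MSE returns the raw SSE; no mean is subtracted */
  }

vpx_get4x4sse_cs_neon is the same idea restricted to a 4x4 block, without the *sse output.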
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 0000000..8632250
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,373 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_avg_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,
+    int x_step_q4,
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,
+    int h) {
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w;
+         width > 0;
+         width -= 4, src += 4, dst += 4) {  // loop_horiz
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+
+      d = dst;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                              d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                              d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+  return;
+}
+
+void vpx_convolve8_avg_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                              d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                              d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
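For readers decoding the intrinsics above: MULTIPLY_BY_Q0 is an 8-tap FIR dot product against the filter taps held in q0s16, the vqrshrun_n_s32(..., 7) narrowings perform the rounded shift by the 7-bit filter precision noted in the companion .asm ("VP9_FILTER_SHIFT == 7"), and vrhaddq_u8 folds the result into the existing destination with a rounding average. A scalar sketch of one output pixel of the _avg path, using an illustrative helper name that is not part of libvpx:

  static unsigned char convolve8_avg_pixel_ref(const unsigned char *src,
                                               const short *filter,
                                               unsigned char old_dst) {
    int k, sum = 0;
    for (k = 0; k < 8; ++k)
      sum += src[k] * filter[k];  /* MULTIPLY_BY_Q0: 8-tap dot product */
    sum += 64;                    /* rounding term for the shift by 7 */
    if (sum < 0) sum = 0;         /* vqrshrun_n_s32 saturates low */
    sum >>= 7;                    /* filter shift */
    if (sum > 255) sum = 255;     /* vqmovn_u16 saturates high */
    return (unsigned char)((sum + old_dst + 1) >> 1);  /* vrhaddq_u8 */
  }

The non-averaging variants in vpx_convolve8_neon.c below apply the same filter without the final averaging step.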
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
similarity index 91%
rename from libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
rename to libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
index 6b20cb9..e279d57 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -17,10 +17,8 @@
     ; VP9_FILTER_WEIGHT == 128
     ; VP9_FILTER_SHIFT == 7
 
-    EXPORT  |vp9_convolve8_avg_horiz_neon|
-    EXPORT  |vp9_convolve8_avg_vert_neon|
-    IMPORT  |vp9_convolve8_avg_horiz_c|
-    IMPORT  |vp9_convolve8_avg_vert_c|
+    EXPORT  |vpx_convolve8_avg_horiz_neon|
+    EXPORT  |vpx_convolve8_avg_vert_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -51,11 +49,7 @@
 ; sp[]int w
 ; sp[]int h
 
-|vp9_convolve8_avg_horiz_neon| PROC
-    ldr             r12, [sp, #4]           ; x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_horiz_c
-
+|vpx_convolve8_avg_horiz_neon| PROC
     push            {r4-r10, lr}
 
     sub             r0, r0, #3              ; adjust for taps
@@ -78,7 +72,7 @@
 
     mov             r10, r6                 ; w loop counter
 
-loop_horiz_v
+vpx_convolve8_avg_loop_horiz_v
     vld1.8          {d24}, [r0], r1
     vld1.8          {d25}, [r0], r1
     vld1.8          {d26}, [r0], r1
@@ -101,7 +95,7 @@
 
     add             r0, r0, #3
 
-loop_horiz
+vpx_convolve8_avg_loop_horiz
     add             r5, r0, #64
 
     vld1.32         {d28[]}, [r0], r1
@@ -170,24 +164,20 @@
     vmov            q9,  q13
 
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
+    bgt             vpx_convolve8_avg_loop_horiz
 
     ; outer loop
     mov             r6, r10                 ; restore w counter
     add             r0, r0, r9              ; src += src_stride * 4 - w
     add             r2, r2, r12             ; dst += dst_stride * 4 - w
     subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
+    bgt vpx_convolve8_avg_loop_horiz_v
 
     pop             {r4-r10, pc}
 
     ENDP
 
-|vp9_convolve8_avg_vert_neon| PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_avg_vert_c
-
+|vpx_convolve8_avg_vert_neon| PROC
     push            {r4-r8, lr}
 
     ; adjust for taps
@@ -203,7 +193,7 @@
     lsl             r1, r1, #1
     lsl             r3, r3, #1
 
-loop_vert_h
+vpx_convolve8_avg_loop_vert_h
     mov             r4, r0
     add             r7, r0, r1, asr #1
     mov             r5, r2
@@ -223,7 +213,7 @@
     vmovl.u8        q10, d20
     vmovl.u8        q11, d22
 
-loop_vert
+vpx_convolve8_avg_loop_vert
     ; always process a 4x4 block at a time
     vld1.u32        {d24[0]}, [r7], r1
     vld1.u32        {d26[0]}, [r4], r1
@@ -288,13 +278,13 @@
     vmov            d22, d25
 
     subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
+    bgt             vpx_convolve8_avg_loop_vert
 
     ; outer loop
     add             r0, r0, #4
     add             r2, r2, #4
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
+    bgt             vpx_convolve8_avg_loop_vert_h
 
     pop             {r4-r8, pc}
 
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 0000000..9bd715e
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,340 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,
+    int x_step_q4,
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,
+    int h) {
+  int width;
+  const uint8_t *s, *psrc;
+  uint8_t *d, *pdst;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4,
+    src += src_stride * 4,
+    dst += dst_stride * 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+    __builtin_prefetch(src + src_stride * 6);
+
+    q8u16  = vmovl_u8(d0x2u8.val[0]);
+    q9u16  = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w, psrc = src + 7, pdst = dst;
+         width > 0;
+         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
+      s = psrc;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(psrc + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(psrc + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                              d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                              d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+      d = pdst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+  }
+  return;
+}
+
+void vpx_convolve8_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                              d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                              d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
similarity index 91%
rename from libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
rename to libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
index 4525845..2d0f2ae 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -17,10 +17,8 @@
     ; VP9_FILTER_WEIGHT == 128
     ; VP9_FILTER_SHIFT == 7
 
-    EXPORT  |vp9_convolve8_horiz_neon|
-    EXPORT  |vp9_convolve8_vert_neon|
-    IMPORT  |vp9_convolve8_horiz_c|
-    IMPORT  |vp9_convolve8_vert_c|
+    EXPORT  |vpx_convolve8_horiz_neon|
+    EXPORT  |vpx_convolve8_vert_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -51,11 +49,7 @@
 ; sp[]int w
 ; sp[]int h
 
-|vp9_convolve8_horiz_neon| PROC
-    ldr             r12, [sp, #4]           ; x_step_q4
-    cmp             r12, #16
-    bne             vp9_convolve8_horiz_c
-
+|vpx_convolve8_horiz_neon| PROC
     push            {r4-r10, lr}
 
     sub             r0, r0, #3              ; adjust for taps
@@ -78,7 +72,7 @@
 
     mov             r10, r6                 ; w loop counter
 
-loop_horiz_v
+vpx_convolve8_loop_horiz_v
     vld1.8          {d24}, [r0], r1
     vld1.8          {d25}, [r0], r1
     vld1.8          {d26}, [r0], r1
@@ -101,7 +95,7 @@
 
     add             r0, r0, #3
 
-loop_horiz
+vpx_convolve8_loop_horiz
     add             r5, r0, #64
 
     vld1.32         {d28[]}, [r0], r1
@@ -159,24 +153,20 @@
     vmov            q9,  q13
 
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_horiz
+    bgt             vpx_convolve8_loop_horiz
 
     ; outer loop
     mov             r6, r10                 ; restore w counter
     add             r0, r0, r9              ; src += src_stride * 4 - w
     add             r2, r2, r12             ; dst += dst_stride * 4 - w
     subs            r7, r7, #4              ; h -= 4
-    bgt loop_horiz_v
+    bgt vpx_convolve8_loop_horiz_v
 
     pop             {r4-r10, pc}
 
     ENDP
 
-|vp9_convolve8_vert_neon| PROC
-    ldr             r12, [sp, #12]
-    cmp             r12, #16
-    bne             vp9_convolve8_vert_c
-
+|vpx_convolve8_vert_neon| PROC
     push            {r4-r8, lr}
 
     ; adjust for taps
@@ -192,7 +182,7 @@
     lsl             r1, r1, #1
     lsl             r3, r3, #1
 
-loop_vert_h
+vpx_convolve8_loop_vert_h
     mov             r4, r0
     add             r7, r0, r1, asr #1
     mov             r5, r2
@@ -212,7 +202,7 @@
     vmovl.u8        q10, d20
     vmovl.u8        q11, d22
 
-loop_vert
+vpx_convolve8_loop_vert
     ; always process a 4x4 block at a time
     vld1.u32        {d24[0]}, [r7], r1
     vld1.u32        {d26[0]}, [r4], r1
@@ -266,13 +256,13 @@
     vmov            d22, d25
 
     subs            r12, r12, #4            ; h -= 4
-    bgt             loop_vert
+    bgt             vpx_convolve8_loop_vert
 
     ; outer loop
     add             r0, r0, #4
     add             r2, r2, #4
     subs            r6, r6, #4              ; w -= 4
-    bgt             loop_vert_h
+    bgt             vpx_convolve8_loop_vert_h
 
     pop             {r4-r8, pc}
 
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 0000000..dc58a33
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,147 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8_t *d;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint32x2_t d0u32, d2u32;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_y;  (void)filter_y_stride;
+
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8  = vld1q_u8(src);
+      q1u8  = vld1q_u8(src + 16);
+      q2u8  = vld1q_u8(src + 32);
+      q3u8  = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8  = vld1q_u8(d);
+      q9u8  = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+                       vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
similarity index 97%
rename from libvpx/vp9/common/arm/neon/vp9_avg_neon.asm
rename to libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
index 7d24530..97e6189 100644
--- a/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -8,14 +8,14 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_convolve_avg_neon|
+    EXPORT  |vpx_convolve_avg_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-|vp9_convolve_avg_neon| PROC
+|vpx_convolve_avg_neon| PROC
     push                {r4-r6, lr}
     ldrd                r4, r5, [sp, #32]
     mov                 r6, r2
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 0000000..d8fb97a
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8x8_t d0u8, d2u8;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_y;  (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    for (; h > 0; h--) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // copy32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // copy16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // copy8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(src);
+      src += src_stride;
+
+      vst1_u8(dst, d0u8);
+      dst += dst_stride;
+      vst1_u8(dst, d2u8);
+      dst += dst_stride;
+    }
+  } else {  // copy4
+    for (; h > 0; h--) {
+      *(uint32_t *)dst = *(const uint32_t *)src;
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
similarity index 96%
rename from libvpx/vp9/common/arm/neon/vp9_copy_neon.asm
rename to libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
index a0bd04a..89164ad 100644
--- a/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -8,14 +8,14 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_convolve_copy_neon|
+    EXPORT  |vpx_convolve_copy_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-|vp9_convolve_copy_neon| PROC
+|vpx_convolve_copy_neon| PROC
     push                {r4-r5, lr}
     ldrd                r4, r5, [sp, #28]
 
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
similarity index 69%
rename from libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
rename to libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index f0881b5..1506ce6 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -8,11 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
@@ -20,62 +22,50 @@
   /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
    * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
    */
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
 
   // Account for the vertical phase needing 3 lines prior and 4 lines post
   int intermediate_height = h + 7;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16) {
-    vp9_convolve8_c(src, src_stride,
-                    dst, dst_stride,
-                    filter_x, x_step_q4,
-                    filter_y, y_step_q4,
-                    w, h);
-    return;
-  }
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
 
   /* Filter starting 3 lines back. The neon implementation will ignore the
    * given height and filter a multiple of 4 lines. Since this goes in to
    * the temp buffer which has lots of extra room and is subsequently discarded
    * this is safe if somewhat less than ideal.
    */
-  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
                            temp, 64,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, intermediate_height);
 
   /* Step into the temp buffer 3 lines to get the actual frame data */
-  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+  vpx_convolve8_vert_neon(temp + 64 * 3, 64,
                           dst, dst_stride,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h);
 }
 
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
-  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
   int intermediate_height = h + 7;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16) {
-    vp9_convolve8_avg_c(src, src_stride,
-                        dst, dst_stride,
-                        filter_x, x_step_q4,
-                        filter_y, y_step_q4,
-                        w, h);
-    return;
-  }
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
 
   /* This implementation has the same issues as above. In addition, we only want
    * to average the values after both passes.
    */
-  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
                            temp, 64,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, intermediate_height);
-  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+  vpx_convolve8_avg_vert_neon(temp + 64 * 3,
                               64, dst, dst_stride,
                               filter_x, x_step_q4, filter_y, y_step_q4,
                               w, h);
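A note on the sizing above: the vertical pass of the 8-tap filter needs 3 rows before and 4 rows after each output row, so the horizontal pass must produce intermediate_height = h + 7 rows. With w and h capped at 64 that is at most 64 x 71 intermediate pixels; the buffer is declared as 64 * 72 = 4608 bytes so the intermediate height stays a multiple of 4, matching the 4-row blocks the NEON kernels always process.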
diff --git a/libvpx/vpx_dsp/bitreader.c b/libvpx/vpx_dsp/bitreader.c
new file mode 100644
index 0000000..4420fad
--- /dev/null
+++ b/libvpx/vpx_dsp/bitreader.c
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/endian_inl.h"
+
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state) {
+  if (size && !buffer) {
+    return 1;
+  } else {
+    r->buffer_end = buffer + size;
+    r->buffer = buffer;
+    r->value = 0;
+    r->count = -8;
+    r->range = 255;
+    r->decrypt_cb = decrypt_cb;
+    r->decrypt_state = decrypt_state;
+    vpx_reader_fill(r);
+    return vpx_read_bit(r) != 0;  // marker bit
+  }
+}
+
+void vpx_reader_fill(vpx_reader *r) {
+  const uint8_t *const buffer_end = r->buffer_end;
+  const uint8_t *buffer = r->buffer;
+  const uint8_t *buffer_start = buffer;
+  BD_VALUE value = r->value;
+  int count = r->count;
+  const size_t bytes_left = buffer_end - buffer;
+  const size_t bits_left = bytes_left * CHAR_BIT;
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+
+  if (r->decrypt_cb) {
+    size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
+    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
+    buffer = r->clear_buffer;
+    buffer_start = r->clear_buffer;
+  }
+  if (bits_left > BD_VALUE_SIZE) {
+      const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+      BD_VALUE nv;
+      BD_VALUE big_endian_values;
+      memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+        big_endian_values = HToBE64(big_endian_values);
+#else
+        big_endian_values = HToBE32(big_endian_values);
+#endif
+      nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+      count += bits;
+      buffer += (bits >> 3);
+      value = r->value | (nv << (shift & 0x7));
+  } else {
+    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;
+    if (bits_over >= 0) {
+      count += LOTS_OF_BITS;
+      loop_end = bits_over;
+    }
+
+    if (bits_over < 0 || bits_left) {
+      while (shift >= loop_end) {
+        count += CHAR_BIT;
+        value |= (BD_VALUE)*buffer++ << shift;
+        shift -= CHAR_BIT;
+      }
+    }
+  }
+
+  // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
+  // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
+  // assign 'buffer' to 'r->buffer'.
+  r->buffer += buffer - buffer_start;
+  r->value = value;
+  r->count = count;
+}
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
+  // Find the end of the coded buffer
+  while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
+    r->count -= CHAR_BIT;
+    r->buffer--;
+  }
+  return r->buffer;
+}
diff --git a/libvpx/vpx_dsp/bitreader.h b/libvpx/vpx_dsp/bitreader.h
new file mode 100644
index 0000000..e817c8b
--- /dev/null
+++ b/libvpx/vpx_dsp/bitreader.h
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
+typedef struct {
+  // Be careful when reordering this struct, it may impact the cache negatively.
+  BD_VALUE value;
+  unsigned int range;
+  int count;
+  const uint8_t *buffer_end;
+  const uint8_t *buffer;
+  vpx_decrypt_cb decrypt_cb;
+  void *decrypt_state;
+  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+} vpx_reader;
+
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state);
+
+void vpx_reader_fill(vpx_reader *r);
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
+
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
+  // Check if we have reached the end of the buffer.
+  //
+  // Variable 'count' stores the number of bits in the 'value' buffer, minus
+  // 8. The top byte is part of the algorithm, and the remainder is buffered
+  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+  // occupied, 8 for the algorithm and 8 in the buffer.
+  //
+  // When reading a byte from the user's buffer, count is filled with 8 and
+  // one byte is filled into the value buffer. When we reach the end of the
+  // data, count is additionally filled with LOTS_OF_BITS. So when
+  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
+  //
+  // Returns 1 if we have tried to decode bits after the end of the stream
+  // was encountered, and 0 if there is no error.
+  return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
+}
+
+static INLINE int vpx_read(vpx_reader *r, int prob) {
+  unsigned int bit = 0;
+  BD_VALUE value;
+  BD_VALUE bigsplit;
+  int count;
+  unsigned int range;
+  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
+
+  if (r->count < 0)
+    vpx_reader_fill(r);
+
+  value = r->value;
+  count = r->count;
+
+  bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+
+  range = split;
+
+  if (value >= bigsplit) {
+    range = r->range - split;
+    value = value - bigsplit;
+    bit = 1;
+  }
+
+  {
+    register unsigned int shift = vpx_norm[range];
+    range <<= shift;
+    value <<= shift;
+    count -= shift;
+  }
+  r->value = value;
+  r->count = count;
+  r->range = range;
+
+  return bit;
+}
+
+static INLINE int vpx_read_bit(vpx_reader *r) {
+  return vpx_read(r, 128);  // vpx_prob_half
+}
+
+static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
+  int literal = 0, bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    literal |= vpx_read_bit(r) << bit;
+
+  return literal;
+}
+
+static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
+                                const vpx_prob *probs) {
+  vpx_tree_index i = 0;
+
+  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
+    continue;
+
+  return -i;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_H_
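For context, vpx_reader is libvpx's boolean (arithmetic) decoder. A hedged usage sketch; the buffer contents and the helper name are assumptions for illustration only:

  #include "vpx_dsp/bitreader.h"

  static int read_example(const uint8_t *buf, size_t size) {
    vpx_reader r;
    int value;
    /* A non-zero return means a bad buffer or a set marker bit. */
    if (vpx_reader_init(&r, buf, size, NULL /* no decrypt_cb */, NULL))
      return -1;
    value = vpx_read_literal(&r, 4);   /* 4-bit unsigned literal */
    if (vpx_read(&r, 128))             /* one bit at probability 128/256 */
      value = -value;
    return vpx_reader_has_error(&r) ? -1 : value;
  }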
diff --git a/libvpx/vpx_dsp/bitreader_buffer.c b/libvpx/vpx_dsp/bitreader_buffer.c
new file mode 100644
index 0000000..fb04ee6
--- /dev/null
+++ b/libvpx/vpx_dsp/bitreader_buffer.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./bitreader_buffer.h"
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) >> 3;
+}
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
+  const size_t off = rb->bit_offset;
+  const size_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  } else {
+    rb->error_handler(rb->error_handler_data);
+    return 0;
+  }
+}
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= vpx_rb_read_bit(rb) << bit;
+  return value;
+}
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
+                               int bits) {
+  const int value = vpx_rb_read_literal(rb, bits);
+  return vpx_rb_read_bit(rb) ? -value : value;
+}
diff --git a/libvpx/vpx_dsp/bitreader_buffer.h b/libvpx/vpx_dsp/bitreader_buffer.h
new file mode 100644
index 0000000..03b156b
--- /dev/null
+++ b/libvpx/vpx_dsp/bitreader_buffer.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vpx_rb_error_handler error_handler;
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_BUFFER_H_
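vpx_read_bit_buffer, by contrast, is a plain MSB-first bit reader with an error callback for overruns. An illustrative sketch; the fields read and the error handler are assumptions, not libvpx code:

  #include <stddef.h>
  #include "vpx_dsp/bitreader_buffer.h"

  static void on_rb_error(void *data) {
    (void)data;  /* real callers typically set an error flag or longjmp */
  }

  static int parse_example(const uint8_t *data, size_t size) {
    struct vpx_read_bit_buffer rb = { data, data + size, 0, NULL, on_rb_error };
    const int version = vpx_rb_read_literal(&rb, 3);       /* 3-bit field */
    const int delta = vpx_rb_read_signed_literal(&rb, 4);  /* sign bit last */
    (void)delta;
    return version;
  }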
diff --git a/libvpx/vp9/encoder/vp9_writer.c b/libvpx/vpx_dsp/bitwriter.c
similarity index 76%
rename from libvpx/vp9/encoder/vp9_writer.c
rename to libvpx/vpx_dsp/bitwriter.c
index ff461f2..5b232e3 100644
--- a/libvpx/vp9/encoder/vp9_writer.c
+++ b/libvpx/vpx_dsp/bitwriter.c
@@ -9,23 +9,23 @@
  */
 
 #include <assert.h>
-#include "vp9/encoder/vp9_writer.h"
-#include "vp9/common/vp9_entropy.h"
 
-void vp9_start_encode(vp9_writer *br, uint8_t *source) {
+#include "./bitwriter.h"
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
-  vp9_write_bit(br, 0);
+  vpx_write_bit(br, 0);
 }
 
-void vp9_stop_encode(vp9_writer *br) {
+void vpx_stop_encode(vpx_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    vp9_write_bit(br, 0);
+    vpx_write_bit(br, 0);
 
   // Ensure there's no ambiguous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
diff --git a/libvpx/vp9/encoder/vp9_writer.h b/libvpx/vpx_dsp/bitwriter.h
similarity index 70%
rename from libvpx/vp9/encoder/vp9_writer.h
rename to libvpx/vpx_dsp/bitwriter.h
index 9d161f9..f6ca9b9 100644
--- a/libvpx/vp9/encoder/vp9_writer.h
+++ b/libvpx/vpx_dsp/bitwriter.h
@@ -8,29 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITER_H_
-#define VP9_ENCODER_VP9_WRITER_H_
+#ifndef VPX_DSP_BITWRITER_H_
+#define VPX_DSP_BITWRITER_H_
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct {
+typedef struct vpx_writer {
   unsigned int lowvalue;
   unsigned int range;
   int count;
   unsigned int pos;
   uint8_t *buffer;
-} vp9_writer;
+} vpx_writer;
 
-void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
-void vp9_stop_encode(vp9_writer *bc);
+void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
+void vpx_stop_encode(vpx_writer *bc);
 
-static INLINE void vp9_write(vp9_writer *br, int bit, int probability) {
+static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -46,7 +46,7 @@
     range = br->range - split;
   }
 
-  shift = vp9_norm[range];
+  shift = vpx_norm[range];
 
   range <<= shift;
   count += shift;
@@ -78,21 +78,21 @@
   br->range = range;
 }
 
-static INLINE void vp9_write_bit(vp9_writer *w, int bit) {
-  vp9_write(w, bit, 128);  // vp9_prob_half
+static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
+  vpx_write(w, bit, 128);  // vpx_prob_half
 }
 
-static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) {
+static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
   int bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_write_bit(w, 1 & (data >> bit));
+    vpx_write_bit(w, 1 & (data >> bit));
 }
 
-#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
+#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITER_H_
+#endif  // VPX_DSP_BITWRITER_H_
diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.c b/libvpx/vpx_dsp/bitwriter_buffer.c
similarity index 75%
rename from libvpx/vp9/encoder/vp9_write_bit_buffer.c
rename to libvpx/vpx_dsp/bitwriter_buffer.c
index 6d55e84..0dfb859 100644
--- a/libvpx/vp9/encoder/vp9_write_bit_buffer.c
+++ b/libvpx/vpx_dsp/bitwriter_buffer.c
@@ -9,13 +9,14 @@
  */
 
 #include <limits.h>
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) {
+#include "./bitwriter_buffer.h"
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
 }
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
   const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
@@ -28,8 +29,8 @@
   wb->bit_offset = off + 1;
 }
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
   int bit;
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
+    vpx_wb_write_bit(wb, (data >> bit) & 1);
 }
diff --git a/libvpx/vp9/encoder/vp9_write_bit_buffer.h b/libvpx/vpx_dsp/bitwriter_buffer.h
similarity index 62%
rename from libvpx/vp9/encoder/vp9_write_bit_buffer.h
rename to libvpx/vpx_dsp/bitwriter_buffer.h
index 59f9bbe..9397668 100644
--- a/libvpx/vp9/encoder/vp9_write_bit_buffer.h
+++ b/libvpx/vpx_dsp/bitwriter_buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
-#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#ifndef VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_DSP_BITWRITER_BUFFER_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -17,20 +17,20 @@
 extern "C" {
 #endif
 
-struct vp9_write_bit_buffer {
+struct vpx_write_bit_buffer {
   uint8_t *bit_buffer;
   size_t bit_offset;
 };
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb);
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
 
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#endif  // VPX_DSP_BITWRITER_BUFFER_H_
diff --git a/libvpx/vpx_dsp/fastssim.c b/libvpx/vpx_dsp/fastssim.c
new file mode 100644
index 0000000..1405a30
--- /dev/null
+++ b/libvpx/vpx_dsp/fastssim.c
@@ -0,0 +1,468 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This code was originally written by: Nathan E. Egge, at the Daala
+ *  project.
+ */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+/* TODO(jbb): High bit depth version of this code needed */
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+  uint16_t *im1;
+  uint16_t *im2;
+  double *ssim;
+  int w;
+  int h;
+};
+
+struct fs_ctx {
+  fs_level *level;
+  int nlevels;
+  unsigned *col_buf;
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+  unsigned char *data;
+  size_t data_size;
+  int lw;
+  int lh;
+  int l;
+  lw = (_w + 1) >> 1;
+  lh = (_h + 1) >> 1;
+  data_size = _nlevels * sizeof(fs_level)
+      + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+  for (l = 0; l < _nlevels; l++) {
+    size_t im_size;
+    size_t level_size;
+    im_size = lw * (size_t) lh;
+    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+    level_size += sizeof(*_ctx->level[l].ssim) - 1;
+    level_size /= sizeof(*_ctx->level[l].ssim);
+    level_size += im_size;
+    level_size *= sizeof(*_ctx->level[l].ssim);
+    data_size += level_size;
+    lw = (lw + 1) >> 1;
+    lh = (lh + 1) >> 1;
+  }
+  data = (unsigned char *) malloc(data_size);
+  _ctx->level = (fs_level *) data;
+  _ctx->nlevels = _nlevels;
+  data += _nlevels * sizeof(*_ctx->level);
+  lw = (_w + 1) >> 1;
+  lh = (_h + 1) >> 1;
+  for (l = 0; l < _nlevels; l++) {
+    size_t im_size;
+    size_t level_size;
+    _ctx->level[l].w = lw;
+    _ctx->level[l].h = lh;
+    im_size = lw * (size_t) lh;
+    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+    level_size += sizeof(*_ctx->level[l].ssim) - 1;
+    level_size /= sizeof(*_ctx->level[l].ssim);
+    level_size *= sizeof(*_ctx->level[l].ssim);
+    _ctx->level[l].im1 = (uint16_t *) data;
+    _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+    data += level_size;
+    _ctx->level[l].ssim = (double *) data;
+    data += im_size * sizeof(*_ctx->level[l].ssim);
+    lw = (lw + 1) >> 1;
+    lh = (lh + 1) >> 1;
+  }
+  _ctx->col_buf = (unsigned *) data;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) {
+  free(_ctx->level);
+}
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+  const uint16_t *src1;
+  const uint16_t *src2;
+  uint16_t *dst1;
+  uint16_t *dst2;
+  int w2;
+  int h2;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  dst1 = _ctx->level[_l].im1;
+  dst2 = _ctx->level[_l].im2;
+  w2 = _ctx->level[_l - 1].w;
+  h2 = _ctx->level[_l - 1].h;
+  src1 = _ctx->level[_l - 1].im1;
+  src2 = _ctx->level[_l - 1].im2;
+  for (j = 0; j < h; j++) {
+    int j0offs;
+    int j1offs;
+    j0offs = 2 * j * w2;
+    j1offs = FS_MINI(2 * j + 1, h2) * w2;
+    for (i = 0; i < w; i++) {
+      int i0;
+      int i1;
+      i0 = 2 * i;
+      i1 = FS_MINI(i0 + 1, w2);
+      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
+          + src1[j1offs + i0] + src1[j1offs + i1];
+      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
+          + src2[j1offs + i0] + src2[j1offs + i1];
+    }
+  }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const unsigned char *_src1,
+                                 int _s1ystride, const unsigned char *_src2,
+                                 int _s2ystride, int _w, int _h) {
+  uint16_t *dst1;
+  uint16_t *dst2;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[0].w;
+  h = _ctx->level[0].h;
+  dst1 = _ctx->level[0].im1;
+  dst2 = _ctx->level[0].im2;
+  for (j = 0; j < h; j++) {
+    int j0;
+    int j1;
+    j0 = 2 * j;
+    j1 = FS_MINI(j0 + 1, _h);
+    for (i = 0; i < w; i++) {
+      int i0;
+      int i1;
+      i0 = 2 * i;
+      i1 = FS_MINI(i0 + 1, _w);
+      dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
+          + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
+          + _src1[j1 * _s1ystride + i1];
+      dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
+          + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
+          + _src2[j1 * _s2ystride + i1];
+    }
+  }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l) {
+  unsigned *col_sums_x;
+  unsigned *col_sums_y;
+  uint16_t *im1;
+  uint16_t *im2;
+  double *ssim;
+  double c1;
+  int w;
+  int h;
+  int j0offs;
+  int j1offs;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  col_sums_x = _ctx->col_buf;
+  col_sums_y = col_sums_x + w;
+  im1 = _ctx->level[_l].im1;
+  im2 = _ctx->level[_l].im2;
+  for (i = 0; i < w; i++)
+    col_sums_x[i] = 5 * im1[i];
+  for (i = 0; i < w; i++)
+    col_sums_y[i] = 5 * im2[i];
+  for (j = 1; j < 4; j++) {
+    j1offs = FS_MINI(j, h - 1) * w;
+    for (i = 0; i < w; i++)
+      col_sums_x[i] += im1[j1offs + i];
+    for (i = 0; i < w; i++)
+      col_sums_y[i] += im2[j1offs + i];
+  }
+  ssim = _ctx->level[_l].ssim;
+  c1 = (double) (SSIM_C1 * 4096 * (1 << 4 * _l));
+  for (j = 0; j < h; j++) {
+    unsigned mux;
+    unsigned muy;
+    int i0;
+    int i1;
+    mux = 5 * col_sums_x[0];
+    muy = 5 * col_sums_y[0];
+    for (i = 1; i < 4; i++) {
+      i1 = FS_MINI(i, w - 1);
+      mux += col_sums_x[i1];
+      muy += col_sums_y[i1];
+    }
+    for (i = 0; i < w; i++) {
+      ssim[j * w + i] *= (2 * mux * (double) muy + c1)
+          / (mux * (double) mux + muy * (double) muy + c1);
+      if (i + 1 < w) {
+        i0 = FS_MAXI(0, i - 4);
+        i1 = FS_MINI(i + 4, w - 1);
+        mux += col_sums_x[i1] - col_sums_x[i0];
+        muy += col_sums_y[i1] - col_sums_y[i0];
+      }
+    }
+    if (j + 1 < h) {
+      j0offs = FS_MAXI(0, j - 4) * w;
+      for (i = 0; i < w; i++)
+        col_sums_x[i] -= im1[j0offs + i];
+      for (i = 0; i < w; i++)
+        col_sums_y[i] -= im2[j0offs + i];
+      j1offs = FS_MINI(j + 4, h - 1) * w;
+      for (i = 0; i < w; i++)
+        col_sums_x[i] += im1[j1offs + i];
+      for (i = 0; i < w; i++)
+        col_sums_y[i] += im2[j1offs + i];
+    }
+  }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] = gx * (double)gx; \
+    col_sums_gy2[(_col)] = gy * (double)gy; \
+    col_sums_gxgy[(_col)] = gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] += gx * (double)gx; \
+    col_sums_gy2[(_col)] += gy * (double)gy; \
+    col_sums_gxgy[(_col)] += gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] -= gx * (double)gx; \
+    col_sums_gy2[(_col)] -= gy * (double)gy; \
+    col_sums_gxgy[(_col)] -= gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+  } \
+  while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+  } \
+  while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+  } \
+  while (0)
+
+static void fs_calc_structure(fs_ctx *_ctx, int _l) {
+  uint16_t *im1;
+  uint16_t *im2;
+  unsigned *gx_buf;
+  unsigned *gy_buf;
+  double *ssim;
+  double col_sums_gx2[8];
+  double col_sums_gy2[8];
+  double col_sums_gxgy[8];
+  double c2;
+  int stride;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  im1 = _ctx->level[_l].im1;
+  im2 = _ctx->level[_l].im2;
+  ssim = _ctx->level[_l].ssim;
+  gx_buf = _ctx->col_buf;
+  stride = w + 8;
+  gy_buf = gx_buf + 8 * stride;
+  memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+  c2 = SSIM_C2 * (1 << 4 * _l) * 16 * 104;
+  for (j = 0; j < h + 4; j++) {
+    if (j < h - 1) {
+      for (i = 0; i < w - 1; i++) {
+        unsigned g1;
+        unsigned g2;
+        unsigned gx;
+        unsigned gy;
+        g1 = abs(im1[(j + 1) * w + i + 1] - im1[j * w + i]);
+        g2 = abs(im1[(j + 1) * w + i] - im1[j * w + i + 1]);
+        gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+        g1 = abs(im2[(j + 1) * w + i + 1] - im2[j * w + i]);
+        g2 = abs(im2[(j + 1) * w + i] - im2[j * w + i + 1]);
+        gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+        gx_buf[(j & 7) * stride + i + 4] = gx;
+        gy_buf[(j & 7) * stride + i + 4] = gy;
+      }
+    } else {
+      memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+      memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+    }
+    if (j >= 4) {
+      int k;
+      col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+      col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+      col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+          col_sums_gxgy[0] = 0;
+      for (i = 4; i < 8; i++) {
+        FS_COL_SET(i, -1, 0);
+        FS_COL_ADD(i, 0, 0);
+        for (k = 1; k < 8 - i; k++) {
+          FS_COL_DOUBLE(i, i);
+          FS_COL_ADD(i, -k - 1, 0);
+          FS_COL_ADD(i, k, 0);
+        }
+      }
+      for (i = 0; i < w; i++) {
+        double mugx2;
+        double mugy2;
+        double mugxgy;
+        mugx2 = col_sums_gx2[0];
+        for (k = 1; k < 8; k++)
+          mugx2 += col_sums_gx2[k];
+        mugy2 = col_sums_gy2[0];
+        for (k = 1; k < 8; k++)
+          mugy2 += col_sums_gy2[k];
+        mugxgy = col_sums_gxgy[0];
+        for (k = 1; k < 8; k++)
+          mugxgy += col_sums_gxgy[k];
+        ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+        if (i + 1 < w) {
+          FS_COL_SET(0, -1, 1);
+          FS_COL_ADD(0, 0, 1);
+          FS_COL_SUB(2, -3, 2);
+          FS_COL_SUB(2, 2, 2);
+          FS_COL_HALVE(1, 2);
+          FS_COL_SUB(3, -4, 3);
+          FS_COL_SUB(3, 3, 3);
+          FS_COL_HALVE(2, 3);
+          FS_COL_COPY(3, 4);
+          FS_COL_DOUBLE(4, 5);
+          FS_COL_ADD(4, -4, 5);
+          FS_COL_ADD(4, 3, 5);
+          FS_COL_DOUBLE(5, 6);
+          FS_COL_ADD(5, -3, 6);
+          FS_COL_ADD(5, 2, 6);
+          FS_COL_DOUBLE(6, 7);
+          FS_COL_ADD(6, -2, 7);
+          FS_COL_ADD(6, 1, 7);
+          FS_COL_SET(7, -1, 8);
+          FS_COL_ADD(7, 0, 8);
+        }
+      }
+    }
+  }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale (0.0448) and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
+    0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
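+
+/* Worked example (illustrative, not part of the upstream source): dropping
+   0.0448 from Wang's five weights leaves a sum of 0.9553, and
+   0.2856 / 0.9553 ~= 0.2990, 0.3001 / 0.9553 ~= 0.3141,
+   0.2363 / 0.9553 ~= 0.2474, 0.1333 / 0.9553 ~= 0.1395,
+   matching the quantized entries above to four decimal places. */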
+
+static double fs_average(fs_ctx *_ctx, int _l) {
+  double *ssim;
+  double ret;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  ssim = _ctx->level[_l].ssim;
+  ret = 0;
+  for (j = 0; j < h; j++)
+    for (i = 0; i < w; i++)
+      ret += ssim[j * w + i];
+  return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double calc_ssim(const unsigned char *_src, int _systride,
+                 const unsigned char *_dst, int _dystride, int _w, int _h) {
+  fs_ctx ctx;
+  double ret;
+  int l;
+  ret = 1;
+  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h);
+  for (l = 0; l < FS_NLEVELS - 1; l++) {
+    fs_calc_structure(&ctx, l);
+    ret *= fs_average(&ctx, l);
+    fs_downsample_level(&ctx, l + 1);
+  }
+  fs_calc_structure(&ctx, l);
+  fs_apply_luminance(&ctx, l);
+  ret *= fs_average(&ctx, l);
+  fs_ctx_clear(&ctx);
+  return ret;
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+  return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
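+
+/* Illustrative arithmetic (not part of the upstream source): with a combined
+   SSIM of 0.98 and a weight of 1.0, this returns
+   10 * (log10(1.0) - log10(0.02)) ~= 16.99 dB. */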
+
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest,
+                         double *ssim_y, double *ssim_u, double *ssim_v) {
+  double ssimv;
+  vpx_clear_system_state();
+
+  *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+                      dest->y_stride, source->y_crop_width,
+                      source->y_crop_height);
+
+  *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+                      dest->uv_stride, source->uv_crop_width,
+                      source->uv_crop_height);
+
+  *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+                      dest->uv_stride, source->uv_crop_width,
+                      source->uv_crop_height);
+  ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+
+  return convert_ssim_db(ssimv, 1.0);
+}
diff --git a/libvpx/vpx_dsp/fwd_txfm.c b/libvpx/vpx_dsp/fwd_txfm.c
new file mode 100644
index 0000000..7baaa8b
--- /dev/null
+++ b/libvpx/vpx_dsp/fwd_txfm.c
@@ -0,0 +1,822 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/fwd_txfm.h"
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done in two very similar passes. The first pass
+  // transforms the columns and transposes the results. The second pass then
+  // transforms the rows: since the first pass left the data transposed,
+  // transforming its columns is equivalent to transforming the original rows,
+  // and transposing once more returns the results to normal row order.
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t input[4];      // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (0 == pass) {
+        input[0] = in_pass0[0 * stride] * 16;
+        input[1] = in_pass0[1 * stride] * 16;
+        input[2] = in_pass0[2 * stride] * 16;
+        input[3] = in_pass0[3 * stride] * 16;
+        if (i == 0 && input[0]) {
+          input[0] += 1;
+        }
+      } else {
+        input[0] = in[0 * 4];
+        input[1] = in[1 * 4];
+        input[2] = in[2 * 4];
+        input[3] = in[3 * 4];
+      }
+      // Transform.
+      step[0] = input[0] + input[3];
+      step[1] = input[1] + input[2];
+      step[2] = input[1] - input[2];
+      step[3] = input[0] - input[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in_pass0++;
+      in++;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+
+  {
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+    }
+  }
+}
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 4; ++r)
+    for (c = 0; c < 4; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum << 1;
+  output[1] = 0;
+}
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+  int i, j;
+  tran_low_t intermediate[64];
+  int pass;
+  tran_low_t *output = intermediate;
+  const tran_low_t *in = NULL;
+
+  // Transform columns
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      if (pass == 0) {
+        s0 = (input[0 * stride] + input[7 * stride]) * 4;
+        s1 = (input[1 * stride] + input[6 * stride]) * 4;
+        s2 = (input[2 * stride] + input[5 * stride]) * 4;
+        s3 = (input[3 * stride] + input[4 * stride]) * 4;
+        s4 = (input[3 * stride] - input[4 * stride]) * 4;
+        s5 = (input[2 * stride] - input[5 * stride]) * 4;
+        s6 = (input[1 * stride] - input[6 * stride]) * 4;
+        s7 = (input[0 * stride] - input[7 * stride]) * 4;
+        ++input;
+      } else {
+        s0 = in[0 * 8] + in[7 * 8];
+        s1 = in[1 * 8] + in[6 * 8];
+        s2 = in[2 * 8] + in[5 * 8];
+        s3 = in[3 * 8] + in[4 * 8];
+        s4 = in[3 * 8] - in[4 * 8];
+        s5 = in[2 * 8] - in[5 * 8];
+        s6 = in[1 * 8] - in[6 * 8];
+        s7 = in[0 * 8] - in[7 * 8];
+        ++in;
+      }
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0] = (tran_low_t)fdct_round_shift(t0);
+      output[2] = (tran_low_t)fdct_round_shift(t2);
+      output[4] = (tran_low_t)fdct_round_shift(t1);
+      output[6] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in  = intermediate;
+    output = final_output;
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      final_output[j + i * 8] /= 2;
+  }
+}
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 8; ++r)
+    for (c = 0; c < 8; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum;
+  output[1] = 0;
+}
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done in two very similar passes. The first pass
+  // transforms the columns and transposes the results. The second pass then
+  // transforms the rows: since the first pass left the data transposed,
+  // transforming its columns is equivalent to transforming the original rows,
+  // and transposing once more returns the results to normal row order.
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 16; i++) {
+      if (0 == pass) {
+        // Calculate input for the first 8 results.
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+        // Calculate input for the next 8 results.
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+      } else {
+        // Calculate input for the first 8 results.
+        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+        // Calculate input for the next 8 results.
+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
+
+        // stage 1
+        s0 = input[0] + input[7];
+        s1 = input[1] + input[6];
+        s2 = input[2] + input[5];
+        s3 = input[3] + input[4];
+        s4 = input[3] - input[4];
+        s5 = input[2] - input[5];
+        s6 = input[1] - input[6];
+        s7 = input[0] - input[7];
+
+        // fdct4(step, step);
+        x0 = s0 + s3;
+        x1 = s1 + s2;
+        x2 = s1 - s2;
+        x3 = s0 - s3;
+        t0 = (x0 + x1) * cospi_16_64;
+        t1 = (x0 - x1) * cospi_16_64;
+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+        out[0] = (tran_low_t)fdct_round_shift(t0);
+        out[4] = (tran_low_t)fdct_round_shift(t2);
+        out[8] = (tran_low_t)fdct_round_shift(t1);
+        out[12] = (tran_low_t)fdct_round_shift(t3);
+
+        // Stage 2
+        t0 = (s6 - s5) * cospi_16_64;
+        t1 = (s6 + s5) * cospi_16_64;
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
+
+        // Stage 3
+        x0 = s4 + t2;
+        x1 = s4 - t2;
+        x2 = s7 - t3;
+        x3 = s7 + t3;
+
+        // Stage 4
+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+        out[2] = (tran_low_t)fdct_round_shift(t0);
+        out[6] = (tran_low_t)fdct_round_shift(t2);
+        out[10] = (tran_low_t)fdct_round_shift(t1);
+        out[14] = (tran_low_t)fdct_round_shift(t3);
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        temp1 = (step1[5] - step1[2]) * cospi_16_64;
+        temp2 = (step1[4] - step1[3]) * cospi_16_64;
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
+        temp1 = (step1[4] + step1[3]) * cospi_16_64;
+        temp2 = (step1[5] + step1[2]) * cospi_16_64;
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
+        // step 3
+        step3[0] = step1[0] + step2[3];
+        step3[1] = step1[1] + step2[2];
+        step3[2] = step1[1] - step2[2];
+        step3[3] = step1[0] - step2[3];
+        step3[4] = step1[7] - step2[4];
+        step3[5] = step1[6] - step2[5];
+        step3[6] = step1[6] + step2[5];
+        step3[7] = step1[7] + step2[4];
+        // step 4
+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
+        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
+        // step 5
+        step1[0] = step3[0] + step2[1];
+        step1[1] = step3[0] - step2[1];
+        step1[2] = step3[3] + step2[2];
+        step1[3] = step3[3] - step2[2];
+        step1[4] = step3[4] - step2[5];
+        step1[5] = step3[4] + step2[5];
+        step1[6] = step3[7] - step2[6];
+        step1[7] = step3[7] + step2[6];
+        // step 6
+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+        out[1] = (tran_low_t)fdct_round_shift(temp1);
+        out[9] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+        out[5] = (tran_low_t)fdct_round_shift(temp1);
+        out[13] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+        out[3] = (tran_low_t)fdct_round_shift(temp1);
+        out[11] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+        out[7] = (tran_low_t)fdct_round_shift(temp1);
+        out[15] = (tran_low_t)fdct_round_shift(temp2);
+      }
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in++;
+      in_pass0++;
+      out += 16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 16; ++r)
+    for (c = 0; c < 16; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum >> 1;
+  output[1] = 0;
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+  // and make the bounds consts.
+  // assert(-131072 <= rv && rv <= 131071);
+  return rv;
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
+  return rv;
+}
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+  tran_high_t step[32];
+  // Stage 1
+  step[0] = input[0] + input[(32 - 1)];
+  step[1] = input[1] + input[(32 - 2)];
+  step[2] = input[2] + input[(32 - 3)];
+  step[3] = input[3] + input[(32 - 4)];
+  step[4] = input[4] + input[(32 - 5)];
+  step[5] = input[5] + input[(32 - 6)];
+  step[6] = input[6] + input[(32 - 7)];
+  step[7] = input[7] + input[(32 - 8)];
+  step[8] = input[8] + input[(32 - 9)];
+  step[9] = input[9] + input[(32 - 10)];
+  step[10] = input[10] + input[(32 - 11)];
+  step[11] = input[11] + input[(32 - 12)];
+  step[12] = input[12] + input[(32 - 13)];
+  step[13] = input[13] + input[(32 - 14)];
+  step[14] = input[14] + input[(32 - 15)];
+  step[15] = input[15] + input[(32 - 16)];
+  step[16] = -input[16] + input[(32 - 17)];
+  step[17] = -input[17] + input[(32 - 18)];
+  step[18] = -input[18] + input[(32 - 19)];
+  step[19] = -input[19] + input[(32 - 20)];
+  step[20] = -input[20] + input[(32 - 21)];
+  step[21] = -input[21] + input[(32 - 22)];
+  step[22] = -input[22] + input[(32 - 23)];
+  step[23] = -input[23] + input[(32 - 24)];
+  step[24] = -input[24] + input[(32 - 25)];
+  step[25] = -input[25] + input[(32 - 26)];
+  step[26] = -input[26] + input[(32 - 27)];
+  step[27] = -input[27] + input[(32 - 28)];
+  step[28] = -input[28] + input[(32 - 29)];
+  step[29] = -input[29] + input[(32 - 30)];
+  step[30] = -input[30] + input[(32 - 31)];
+  step[31] = -input[31] + input[(32 - 32)];
+
+  // Stage 2
+  output[0] = step[0] + step[16 - 1];
+  output[1] = step[1] + step[16 - 2];
+  output[2] = step[2] + step[16 - 3];
+  output[3] = step[3] + step[16 - 4];
+  output[4] = step[4] + step[16 - 5];
+  output[5] = step[5] + step[16 - 6];
+  output[6] = step[6] + step[16 - 7];
+  output[7] = step[7] + step[16 - 8];
+  output[8] = -step[8] + step[16 - 9];
+  output[9] = -step[9] + step[16 - 10];
+  output[10] = -step[10] + step[16 - 11];
+  output[11] = -step[11] + step[16 - 12];
+  output[12] = -step[12] + step[16 - 13];
+  output[13] = -step[13] + step[16 - 14];
+  output[14] = -step[14] + step[16 - 15];
+  output[15] = -step[15] + step[16 - 16];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = step[18];
+  output[19] = step[19];
+
+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+  output[28] = step[28];
+  output[29] = step[29];
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // Scale the magnitude down by 4 so the intermediate values stay within
+  // the 16-bit range.
+  if (round) {
+    output[0] = half_round_shift(output[0]);
+    output[1] = half_round_shift(output[1]);
+    output[2] = half_round_shift(output[2]);
+    output[3] = half_round_shift(output[3]);
+    output[4] = half_round_shift(output[4]);
+    output[5] = half_round_shift(output[5]);
+    output[6] = half_round_shift(output[6]);
+    output[7] = half_round_shift(output[7]);
+    output[8] = half_round_shift(output[8]);
+    output[9] = half_round_shift(output[9]);
+    output[10] = half_round_shift(output[10]);
+    output[11] = half_round_shift(output[11]);
+    output[12] = half_round_shift(output[12]);
+    output[13] = half_round_shift(output[13]);
+    output[14] = half_round_shift(output[14]);
+    output[15] = half_round_shift(output[15]);
+
+    output[16] = half_round_shift(output[16]);
+    output[17] = half_round_shift(output[17]);
+    output[18] = half_round_shift(output[18]);
+    output[19] = half_round_shift(output[19]);
+    output[20] = half_round_shift(output[20]);
+    output[21] = half_round_shift(output[21]);
+    output[22] = half_round_shift(output[22]);
+    output[23] = half_round_shift(output[23]);
+    output[24] = half_round_shift(output[24]);
+    output[25] = half_round_shift(output[25]);
+    output[26] = half_round_shift(output[26]);
+    output[27] = half_round_shift(output[27]);
+    output[28] = half_round_shift(output[28]);
+    output[29] = half_round_shift(output[29]);
+    output[30] = half_round_shift(output[30]);
+    output[31] = half_round_shift(output[31]);
+  }
+
+  // Stage 3
+  step[0] = output[0] + output[(8 - 1)];
+  step[1] = output[1] + output[(8 - 2)];
+  step[2] = output[2] + output[(8 - 3)];
+  step[3] = output[3] + output[(8 - 4)];
+  step[4] = -output[4] + output[(8 - 5)];
+  step[5] = -output[5] + output[(8 - 6)];
+  step[6] = -output[6] + output[(8 - 7)];
+  step[7] = -output[7] + output[(8 - 8)];
+  step[8] = output[8];
+  step[9] = output[9];
+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+  step[14] = output[14];
+  step[15] = output[15];
+
+  step[16] = output[16] + output[23];
+  step[17] = output[17] + output[22];
+  step[18] = output[18] + output[21];
+  step[19] = output[19] + output[20];
+  step[20] = -output[20] + output[19];
+  step[21] = -output[21] + output[18];
+  step[22] = -output[22] + output[17];
+  step[23] = -output[23] + output[16];
+  step[24] = -output[24] + output[31];
+  step[25] = -output[25] + output[30];
+  step[26] = -output[26] + output[29];
+  step[27] = -output[27] + output[28];
+  step[28] = output[28] + output[27];
+  step[29] = output[29] + output[26];
+  step[30] = output[30] + output[25];
+  step[31] = output[31] + output[24];
+
+  // Stage 4
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = -step[2] + step[1];
+  output[3] = -step[3] + step[0];
+  output[4] = step[4];
+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+  output[7] = step[7];
+  output[8] = step[8] + step[11];
+  output[9] = step[9] + step[10];
+  output[10] = -step[10] + step[9];
+  output[11] = -step[11] + step[8];
+  output[12] = -step[12] + step[15];
+  output[13] = -step[13] + step[14];
+  output[14] = step[14] + step[13];
+  output[15] = step[15] + step[12];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+  output[22] = step[22];
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = step[25];
+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // Stage 5
+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+  step[4] = output[4] + output[5];
+  step[5] = -output[5] + output[4];
+  step[6] = -output[6] + output[7];
+  step[7] = output[7] + output[6];
+  step[8] = output[8];
+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+  step[11] = output[11];
+  step[12] = output[12];
+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+  step[15] = output[15];
+
+  step[16] = output[16] + output[19];
+  step[17] = output[17] + output[18];
+  step[18] = -output[18] + output[17];
+  step[19] = -output[19] + output[16];
+  step[20] = -output[20] + output[23];
+  step[21] = -output[21] + output[22];
+  step[22] = output[22] + output[21];
+  step[23] = output[23] + output[20];
+  step[24] = output[24] + output[27];
+  step[25] = output[25] + output[26];
+  step[26] = -output[26] + output[25];
+  step[27] = -output[27] + output[24];
+  step[28] = -output[28] + output[31];
+  step[29] = -output[29] + output[30];
+  step[30] = output[30] + output[29];
+  step[31] = output[31] + output[28];
+
+  // Stage 6
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+  output[8] = step[8] + step[9];
+  output[9] = -step[9] + step[8];
+  output[10] = -step[10] + step[11];
+  output[11] = step[11] + step[10];
+  output[12] = step[12] + step[13];
+  output[13] = -step[13] + step[12];
+  output[14] = -step[14] + step[15];
+  output[15] = step[15] + step[14];
+
+  output[16] = step[16];
+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+  output[19] = step[19];
+  output[20] = step[20];
+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+  output[27] = step[27];
+  output[28] = step[28];
+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+  output[31] = step[31];
+
+  // Stage 7
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+  step[16] = output[16] + output[17];
+  step[17] = -output[17] + output[16];
+  step[18] = -output[18] + output[19];
+  step[19] = output[19] + output[18];
+  step[20] = output[20] + output[21];
+  step[21] = -output[21] + output[20];
+  step[22] = -output[22] + output[23];
+  step[23] = output[23] + output[22];
+  step[24] = output[24] + output[25];
+  step[25] = -output[25] + output[24];
+  step[26] = -output[26] + output[27];
+  step[27] = output[27] + output[26];
+  step[28] = output[28] + output[29];
+  step[29] = -output[29] + output[28];
+  step[30] = -output[30] + output[31];
+  step[31] = output[31] + output[30];
+
+  // Final stage: output indices are bit-reversed.
+  output[0]  = step[0];
+  output[16] = step[1];
+  output[8]  = step[2];
+  output[24] = step[3];
+  output[4]  = step[4];
+  output[20] = step[5];
+  output[12] = step[6];
+  output[28] = step[7];
+  output[2]  = step[8];
+  output[18] = step[9];
+  output[10] = step[10];
+  output[26] = step[11];
+  output[6]  = step[12];
+  output[22] = step[13];
+  output[14] = step[14];
+  output[30] = step[15];
+
+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  int i, j;
+  tran_high_t output[32 * 32];
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    tran_high_t temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = input[j * stride + i] * 4;
+    vpx_fdct32(temp_in, temp_out, 0);
+    for (j = 0; j < 32; ++j)
+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_high_t temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i * 32];
+    vpx_fdct32(temp_in, temp_out, 0);
+    for (j = 0; j < 32; ++j)
+      out[j + i * 32] =
+          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+  }
+}
+
+// Note that although dct_32_round is used in the dct32 computation flow,
+// this 2D fdct32x32 for the rate-distortion optimization loop operates
+// within 16-bit precision.
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+  int i, j;
+  tran_high_t output[32 * 32];
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    tran_high_t temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = input[j * stride + i] * 4;
+    vpx_fdct32(temp_in, temp_out, 0);
+    for (j = 0; j < 32; ++j)
+      // TODO(cd): see quality impact of only doing
+      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+      //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_high_t temp_in[32], temp_out[32];
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i * 32];
+    vpx_fdct32(temp_in, temp_out, 1);
+    for (j = 0; j < 32; ++j)
+      out[j + i * 32] = (tran_low_t)temp_out[j];
+  }
+}
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 32; ++r)
+    for (c = 0; c < 32; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum >> 3;
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vpx_fdct4x4_c(input, output, stride);
+}
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vpx_fdct8x8_c(input, final_output, stride);
+}
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                            int stride) {
+  vpx_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vpx_fdct16x16_c(input, output, stride);
+}
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                              int stride) {
+  vpx_fdct16x16_1_c(input, output, stride);
+}
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  vpx_fdct32x32_c(input, out, stride);
+}
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                               int stride) {
+  vpx_fdct32x32_rd_c(input, out, stride);
+}
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+                              int stride) {
+  vpx_fdct32x32_1_c(input, out, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/fwd_txfm.h b/libvpx/vpx_dsp/fwd_txfm.h
new file mode 100644
index 0000000..29e139c
--- /dev/null
+++ b/libvpx/vpx_dsp/fwd_txfm.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_FWD_TXFM_H_
+#define VPX_DSP_FWD_TXFM_H_
+
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert
+  // and make the bounds consts.
+  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
+  return rv;
+}
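+
+/* Rough usage note (illustrative, assuming DCT_CONST_BITS is 14 and that the
+   cospi_*_64 constants in txfm_common.h are cos(k * pi / 64) scaled by 2^14):
+   fdct_round_shift(x * cospi_16_64) approximates x * cos(pi / 4), since
+   cospi_16_64 == 11585 ~= 0.7071 * 16384 and the shift divides by 2^14 with
+   rounding. */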
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif  // VPX_DSP_FWD_TXFM_H_
diff --git a/libvpx/vpx_dsp/intrapred.c b/libvpx/vpx_dsp/intrapred.c
new file mode 100644
index 0000000..9ba0f64
--- /dev/null
+++ b/libvpx/vpx_dsp/intrapred.c
@@ -0,0 +1,692 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define DST(x, y) dst[(x) + (y) * stride]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
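+
+/* Illustrative arithmetic (not part of the upstream source): AVG3 is a
+   rounded 1:2:1 filter and AVG2 a rounded average, e.g.
+   AVG3(10, 20, 30) == (10 + 40 + 30 + 2) >> 2 == 20 and
+   AVG2(10, 21) == (10 + 21 + 1) >> 1 == 16. */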
+
+static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) above;
+  // first column
+  for (r = 0; r < bs - 1; ++r)
+    dst[r * stride] = AVG2(left[r], left[r + 1]);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // second column
+  for (r = 0; r < bs - 2; ++r)
+    dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+  dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // rest of last row
+  for (c = 0; c < bs - 2; ++c)
+    dst[(bs - 1) * stride + c] = left[bs - 1];
+
+  for (r = bs - 2; r >= 0; --r)
+    for (c = 0; c < bs - 2; ++c)
+      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+}
+
+static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  int size;
+  (void)left;
+  for (c = 0; c < bs; ++c) {
+    dst[c] = AVG2(above[c], above[c + 1]);
+    dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+  }
+  for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+    memcpy(dst + (r + 0) * stride, dst + (r >> 1), size);
+    memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+    memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
+    memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+  }
+}
+
+static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8_t above_right = above[bs - 1];
+  const uint8_t *const dst_row0 = dst;
+  int x, size;
+  (void)left;
+
+  for (x = 0; x < bs - 1; ++x) {
+    dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+  }
+  dst[bs - 1] = above_right;
+  dst += stride;
+  for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+    memcpy(dst, dst_row0 + x, size);
+    memset(dst + size, above_right, x + 1);
+    dst += stride;
+  }
+}
+
+static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+
+  // first row
+  for (c = 0; c < bs; c++)
+    dst[c] = AVG2(above[c - 1], above[c]);
+  dst += stride;
+
+  // second row
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (c = 1; c < bs; c++)
+    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+  dst += stride;
+
+  // the rest of the first column
+  dst[0] = AVG3(above[-1], left[0], left[1]);
+  for (r = 3; r < bs; ++r)
+    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+  // the rest of the block
+  for (r = 2; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-2 * stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (c = 1; c < bs; c++)
+    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (r = 2; r < bs; ++r)
+    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+
+  dst += stride;
+  for (r = 1; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  dst[0] = AVG2(above[-1], left[0]);
+  for (r = 1; r < bs; r++)
+    dst[r * stride] = AVG2(left[r - 1], left[r]);
+  dst++;
+
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (r = 2; r < bs; r++)
+    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+  dst++;
+
+  for (c = 0; c < bs - 2; c++)
+    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+  dst += stride;
+
+  for (r = 1; r < bs; ++r) {
+    for (c = 0; c < bs - 2; c++)
+      dst[c] = dst[-stride + c - 2];
+    dst += stride;
+  }
+}
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void) left;
+
+  for (r = 0; r < bs; r++) {
+    memcpy(dst, above, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void) above;
+
+  for (r = 0; r < bs; r++) {
+    memset(dst, left[r], bs);
+    dst += stride;
+  }
+}
+
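+// TM ("TrueMotion") prediction: each pixel is predicted as
+// left[r] + above[c] - above[-1], clipped to the valid pixel range.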
+static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  int ytop_left = above[-1];
+
+  for (r = 0; r < bs; r++) {
+    for (c = 0; c < bs; c++)
+      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
+    dst += stride;
+  }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void) above;
+  (void) left;
+
+  for (r = 0; r < bs; r++) {
+    memset(dst, 128, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
+  (void) above;
+
+  for (i = 0; i < bs; i++)
+    sum += left[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    memset(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
+  (void) left;
+
+  for (i = 0; i < bs; i++)
+    sum += above[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    memset(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  int i, r, expected_dc, sum = 0;
+  const int count = 2 * bs;
+
+  for (i = 0; i < bs; i++) {
+    sum += above[i];
+    sum += left[i];
+  }
+
+  expected_dc = (sum + (count >> 1)) / count;
+
+  for (r = 0; r < bs; r++) {
+    memset(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
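+// The three averaging DC variants above differ only in which boundary is
+// averaged: dc_left uses the left column, dc_top the above row, and dc both,
+// each rounding to nearest (e.g. for bs == 8, dc computes (sum + 8) / 16).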
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  (void)above;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  (void)left;
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+              DST(3, 2) = AVG2(E, F);  // differs from vp8
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 3) = AVG3(E, F, G);  // differs from vp8
+}
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  const int H = above[7];
+  (void)stride;
+  (void)left;
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+              DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+                                      DST(3, 3) = H;  // differs from vp8
+}
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  (void)stride;
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(1, 3) = DST(0, 2)                         = AVG3(I, J, K);
+  DST(2, 3) = DST(1, 2) = DST(0, 1)             = AVG3(X, I, J);
+  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+              DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+                          DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+                                      DST(3, 0) = AVG3(D, C, B);
+}
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int r, c;
+  (void) above;
+  (void) bd;
+
+  // First column.
+  for (r = 0; r < bs - 1; ++r) {
+    dst[r * stride] = AVG2(left[r], left[r + 1]);
+  }
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // Second column.
+  for (r = 0; r < bs - 2; ++r) {
+    dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+  }
+  dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // Rest of last row.
+  for (c = 0; c < bs - 2; ++c)
+    dst[(bs - 1) * stride + c] = left[bs - 1];
+
+  for (r = bs - 2; r >= 0; --r) {
+    for (c = 0; c < bs - 2; ++c)
+      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+  }
+}
+
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  int r, c;
+  (void) left;
+  (void) bd;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
+                            above[(r >> 1) + c + 2])
+          : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  int r, c;
+  (void) left;
+  (void) bd;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1],
+                                         above[r + c + 2])
+          : above[bs * 2 - 1];
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int r, c;
+  (void) bd;
+
+  // first row
+  for (c = 0; c < bs; c++)
+    dst[c] = AVG2(above[c - 1], above[c]);
+  dst += stride;
+
+  // second row
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (c = 1; c < bs; c++)
+    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+  dst += stride;
+
+  // the rest of the first column
+  dst[0] = AVG3(above[-1], left[0], left[1]);
+  for (r = 3; r < bs; ++r)
+    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+
+  // the rest of the block
+  for (r = 2; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-2 * stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int r, c;
+  (void) bd;
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (c = 1; c < bs; c++)
+    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (r = 2; r < bs; ++r)
+    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+
+  dst += stride;
+  for (r = 1; r < bs; ++r) {
+    for (c = 1; c < bs; c++)
+      dst[c] = dst[-stride + c - 1];
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int r, c;
+  (void) bd;
+  dst[0] = AVG2(above[-1], left[0]);
+  for (r = 1; r < bs; r++)
+    dst[r * stride] = AVG2(left[r - 1], left[r]);
+  dst++;
+
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (r = 2; r < bs; r++)
+    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+  dst++;
+
+  for (c = 0; c < bs - 2; c++)
+    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
+  dst += stride;
+
+  for (r = 1; r < bs; ++r) {
+    for (c = 0; c < bs - 2; c++)
+      dst[c] = dst[-stride + c - 2];
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;
+  (void) left;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    memcpy(dst, above, bs * sizeof(uint16_t));
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;
+  (void) above;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, left[r], bs);
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int r, c;
+  int ytop_left = above[-1];
+  (void) bd;
+
+  for (r = 0; r < bs; r++) {
+    for (c = 0; c < bs; c++)
+      dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  int r;
+  (void) above;
+  (void) left;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, 128 << (bd - 8), bs);
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+                                            int bs, const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  int i, r, expected_dc, sum = 0;
+  (void) above;
+  (void) bd;
+
+  for (i = 0; i < bs; i++)
+    sum += left[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  int i, r, expected_dc, sum = 0;
+  (void) left;
+  (void) bd;
+
+  for (i = 0; i < bs; i++)
+    sum += above[i];
+  expected_dc = (sum + (bs >> 1)) / bs;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i, r, expected_dc, sum = 0;
+  const int count = 2 * bs;
+  (void) bd;
+
+  for (i = 0; i < bs; i++) {
+    sum += above[i];
+    sum += left[i];
+  }
+
+  expected_dc = (sum + (count >> 1)) / count;
+
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, expected_dc, bs);
+    dst += stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// This serves as a wrapper function, so that all the prediction functions
+// can be unified and accessed as a pointer array. Note that the boundary
+// arrays above and left are not always used.
+#define intra_pred_sized(type, size) \
+  void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define intra_pred_highbd_sized(type, size) \
+  void vpx_highbd_##type##_predictor_##size##x##size##_c( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    highbd_##type##_predictor(dst, stride, size, above, left, bd); \
+  }
+
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+
+#else
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
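+// As an illustration, intra_pred_sized(dc, 8) (instantiated through
+// intra_pred_allsizes(dc) below) expands to:
+//   void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
+//                               const uint8_t *above, const uint8_t *left) {
+//     dc_predictor(dst, stride, 8, above, left);
+//   }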
+
+intra_pred_no_4x4(d207)
+intra_pred_no_4x4(d63)
+intra_pred_no_4x4(d45)
+intra_pred_no_4x4(d117)
+intra_pred_no_4x4(d135)
+intra_pred_no_4x4(d153)
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(tm)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+#undef intra_pred_allsizes
diff --git a/libvpx/vpx_dsp/inv_txfm.c b/libvpx/vpx_dsp/inv_txfm.c
new file mode 100644
index 0000000..3afa8cd
--- /dev/null
+++ b/libvpx/vpx_dsp/inv_txfm.c
@@ -0,0 +1,2477 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vpx_dsp/inv_txfm.h"
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
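+/* Each 4-sample pass below uses 7 adds/subtracts and 1 shift; transforming
+   rows and then columns touches every pixel twice, giving 14 adds and
+   2 shifts per 4 pixels, i.e. the 3.5 adds and 0.5 shifts quoted above. */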
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, 8);
+    op[1] = WRAPLOW(b1, 8);
+    op[2] = WRAPLOW(c1, 8);
+    op[3] = WRAPLOW(d1, 8);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1, 8);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+    ip++;
+    dest++;
+  }
+}
+
+void idct4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], 8);
+  output[1] = WRAPLOW(step[1] + step[2], 8);
+  output[2] = WRAPLOW(step[1] - step[2], 8);
+  output[3] = WRAPLOW(step[0] - step[3], 8);
+}
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    idct4_c(input, outptr);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    idct4_c(temp_in, temp_out);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
+  }
+}
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = clip_pixel_add(dest[0], a1);
+    dest[1] = clip_pixel_add(dest[1], a1);
+    dest[2] = clip_pixel_add(dest[2], a1);
+    dest[3] = clip_pixel_add(dest[3], a1);
+    dest += dest_stride;
+  }
+}
+
+void idct8_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2 & stage 3 - even half
+  idct4_c(step1, step1);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(step1[1] + step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(step1[3] + step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(step1[2] - step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  for (i = 0; i < 8; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
+
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
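+  // dct_const_round_shift() then drops DCT_CONST_BITS (14) of that range,
+  // which yields the 15b output bound stated above.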
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
+  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+}
+
+void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
+  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
+  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
+  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+  // stage 2
+  s0 = (int)x0;
+  s1 = (int)x1;
+  s2 = (int)x2;
+  s3 = (int)x3;
+  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+  x0 = WRAPLOW(s0 + s2, 8);
+  x1 = WRAPLOW(s1 + s3, 8);
+  x2 = WRAPLOW(s0 - s2, 8);
+  x3 = WRAPLOW(s1 - s3, 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+  // stage 3
+  s2 = (int)(cospi_16_64 * (x2 + x3));
+  s3 = (int)(cospi_16_64 * (x2 - x3));
+  s6 = (int)(cospi_16_64 * (x6 + x7));
+  s7 = (int)(cospi_16_64 * (x6 - x7));
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x4, 8);
+  output[2] = WRAPLOW(x6, 8);
+  output[3] = WRAPLOW(-x2, 8);
+  output[4] = WRAPLOW(x3, 8);
+  output[5] = WRAPLOW(-x7, 8);
+  output[6] = WRAPLOW(x5, 8);
+  output[7] = WRAPLOW(-x1, 8);
+}
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows.
+  // Only the first 4 rows have non-zero coefficients.
+  for (i = 0; i < 4; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
+void idct16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(step2[1] + step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(step2[3] + step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(step2[5] + step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(step2[7] + step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(step2[6] - step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(step2[4] - step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(step2[2] - step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, 8);
+  x1 = WRAPLOW(s1 + s5, 8);
+  x2 = WRAPLOW(s2 + s6, 8);
+  x3 = WRAPLOW(s3 + s7, 8);
+  x4 = WRAPLOW(s0 - s4, 8);
+  x5 = WRAPLOW(s1 - s5, 8);
+  x6 = WRAPLOW(s2 - s6, 8);
+  x7 = WRAPLOW(s3 - s7, 8);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(check_range(s0 + s2), 8);
+  x1 = WRAPLOW(check_range(s1 + s3), 8);
+  x2 = WRAPLOW(check_range(s0 - s2), 8);
+  x3 = WRAPLOW(check_range(s1 - s3), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x8 = WRAPLOW(check_range(s8 + s10), 8);
+  x9 = WRAPLOW(check_range(s9 + s11), 8);
+  x10 = WRAPLOW(check_range(s8 - s10), 8);
+  x11 = WRAPLOW(check_range(s9 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x8, 8);
+  output[2] = WRAPLOW(x12, 8);
+  output[3] = WRAPLOW(-x4, 8);
+  output[4] = WRAPLOW(x6, 8);
+  output[5] = WRAPLOW(x14, 8);
+  output[6] = WRAPLOW(x10, 8);
+  output[7] = WRAPLOW(x2, 8);
+  output[8] = WRAPLOW(x3, 8);
+  output[9] = WRAPLOW(x11, 8);
+  output[10] = WRAPLOW(x15, 8);
+  output[11] = WRAPLOW(x7, 8);
+  output[12] = WRAPLOW(x5, 8);
+  output[13] = WRAPLOW(-x13, 8);
+  output[14] = WRAPLOW(x9, 8);
+  output[15] = WRAPLOW(-x1, 8);
+}
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
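+// In the DC-only path above, input[0] is scaled by cospi_16_64 once per 1-D
+// pass and then rounded down by 6 bits, mirroring the per-column
+// ROUND_POWER_OF_TWO(temp_out[j], 6) used by the full 16x16 transforms.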
+
+void idct32_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], 8);
+  output[1] = WRAPLOW(step1[1] + step1[30], 8);
+  output[2] = WRAPLOW(step1[2] + step1[29], 8);
+  output[3] = WRAPLOW(step1[3] + step1[28], 8);
+  output[4] = WRAPLOW(step1[4] + step1[27], 8);
+  output[5] = WRAPLOW(step1[5] + step1[26], 8);
+  output[6] = WRAPLOW(step1[6] + step1[25], 8);
+  output[7] = WRAPLOW(step1[7] + step1[24], 8);
+  output[8] = WRAPLOW(step1[8] + step1[23], 8);
+  output[9] = WRAPLOW(step1[9] + step1[22], 8);
+  output[10] = WRAPLOW(step1[10] + step1[21], 8);
+  output[11] = WRAPLOW(step1[11] + step1[20], 8);
+  output[12] = WRAPLOW(step1[12] + step1[19], 8);
+  output[13] = WRAPLOW(step1[13] + step1[18], 8);
+  output[14] = WRAPLOW(step1[14] + step1[17], 8);
+  output[15] = WRAPLOW(step1[15] + step1[16], 8);
+  output[16] = WRAPLOW(step1[15] - step1[16], 8);
+  output[17] = WRAPLOW(step1[14] - step1[17], 8);
+  output[18] = WRAPLOW(step1[13] - step1[18], 8);
+  output[19] = WRAPLOW(step1[12] - step1[19], 8);
+  output[20] = WRAPLOW(step1[11] - step1[20], 8);
+  output[21] = WRAPLOW(step1[10] - step1[21], 8);
+  output[22] = WRAPLOW(step1[9] - step1[22], 8);
+  output[23] = WRAPLOW(step1[8] - step1[23], 8);
+  output[24] = WRAPLOW(step1[7] - step1[24], 8);
+  output[25] = WRAPLOW(step1[6] - step1[25], 8);
+  output[26] = WRAPLOW(step1[5] - step1[26], 8);
+  output[27] = WRAPLOW(step1[4] - step1[27], 8);
+  output[28] = WRAPLOW(step1[3] - step1[28], 8);
+  output[29] = WRAPLOW(step1[2] - step1[29], 8);
+  output[30] = WRAPLOW(step1[1] - step1[30], 8);
+  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    int16_t zero_coeff[16];
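+    // Pairwise OR-reduce the 32 coefficients of this row; after the four
+    // reduction passes, zero_coeff[0] | zero_coeff[1] is non-zero iff the
+    // row has any non-zero coefficient, so all-zero rows skip the 1-D
+    // transform below.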
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      idct32_c(input, outptr);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  // Only the upper-left 8x8 block has non-zero coefficients.
+  for (i = 0; i < 8; ++i) {
+    idct32_c(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+
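+  // DC-only path: the single DC coefficient is scaled by cospi_16_64 once per
+  // 1-D pass (rows, then columns), rounded by the final shift of 6, and the
+  // resulting value is added uniformly to every pixel of the 32x32 block.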
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, bd);
+    op[1] = WRAPLOW(b1, bd);
+    op[2] = WRAPLOW(c1, bd);
+    op[3] = WRAPLOW(d1, bd);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  (void) bd;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = highbd_clip_pixel_add(
+        dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = highbd_clip_pixel_add(
+        dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = highbd_clip_pixel_add(
+        dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = highbd_clip_pixel_add(
+        dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(step[1] + step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[0] - step[3], bd);
+}
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    vpx_highbd_idct4_c(input, outptr, bd);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    vpx_highbd_idct4_c(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+    dest += dest_stride;
+  }
+}
+
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2 & stage 3 - even half
+  vpx_highbd_idct4_c(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 8; ++i) {
+    vpx_highbd_idct8_c(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    vpx_highbd_idct8_c(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3)) {
+    memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = (tran_high_t)(x0 - x2 + x3);
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence, after the DCT_CONST_BITS (14 bit) rounding shift, the output bit
+  // depth is 15b.
+  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[7];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[5];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[3];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[1];
+  tran_low_t x7 = input[6];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x4, bd);
+  output[2] = WRAPLOW(x6, bd);
+  output[3] = WRAPLOW(-x2, bd);
+  output[4] = WRAPLOW(x3, bd);
+  output[5] = WRAPLOW(-x7, bd);
+  output[6] = WRAPLOW(x5, bd);
+  output[7] = WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  // Only the first 4 rows have non-zero coefficients.
+  for (i = 0; i < 4; ++i) {
+    vpx_highbd_idct8_c(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    vpx_highbd_idct8_c(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
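+  // The constant N/2 index expressions evaluate at compile time to input[0],
+  // input[8], input[4], input[12], and so on, which is the 32-point
+  // transform's coefficient ordering with every index halved.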
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 16; ++i) {
+    vpx_highbd_idct16_c(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    vpx_highbd_idct16_c(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_low_t x0 = input[15];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[13];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[11];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[9];
+  tran_low_t x7 = input[6];
+  tran_low_t x8 = input[7];
+  tran_low_t x9 = input[8];
+  tran_low_t x10 = input[5];
+  tran_low_t x11 = input[10];
+  tran_low_t x12 = input[3];
+  tran_low_t x13 = input[12];
+  tran_low_t x14 = input[1];
+  tran_low_t x15 = input[14];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, bd);
+  x1 = WRAPLOW(s1 + s5, bd);
+  x2 = WRAPLOW(s2 + s6, bd);
+  x3 = WRAPLOW(s3 + s7, bd);
+  x4 = WRAPLOW(s0 - s4, bd);
+  x5 = WRAPLOW(s1 - s5, bd);
+  x6 = WRAPLOW(s2 - s6, bd);
+  x7 = WRAPLOW(s3 - s7, bd);
+  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x8 = WRAPLOW(s8 + s10, bd);
+  x9 = WRAPLOW(s9 + s11, bd);
+  x10 = WRAPLOW(s8 - s10, bd);
+  x11 = WRAPLOW(s9 - s11, bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+  // stage 4
+  s2 = (-cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (-cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x8, bd);
+  output[2] = WRAPLOW(x12, bd);
+  output[3] = WRAPLOW(-x4, bd);
+  output[4] = WRAPLOW(x6, bd);
+  output[5] = WRAPLOW(x14, bd);
+  output[6] = WRAPLOW(x10, bd);
+  output[7] = WRAPLOW(x2, bd);
+  output[8] = WRAPLOW(x3, bd);
+  output[9] = WRAPLOW(x11, bd);
+  output[10] = WRAPLOW(x15, bd);
+  output[11] = WRAPLOW(x7, bd);
+  output[12] = WRAPLOW(x5, bd);
+  output[13] = WRAPLOW(-x13, bd);
+  output[14] = WRAPLOW(x9, bd);
+  output[15] = WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows. Since all non-zero DCT coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    vpx_highbd_idct16_c(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    vpx_highbd_idct16_c(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+static void highbd_idct32_c(const tran_low_t *input,
+                            tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_low_t zero_coeff[16];
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      highbd_idct32_c(input, outptr, bd);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    highbd_idct32_c(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  // Only the upper-left 8x8 block has non-zero coefficients.
+  for (i = 0; i < 8; ++i) {
+    highbd_idct32_c(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    highbd_idct32_c(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  int i, j;
+  int a1;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/inv_txfm.h b/libvpx/vpx_dsp/inv_txfm.h
new file mode 100644
index 0000000..2358813
--- /dev/null
+++ b/libvpx/vpx_dsp/inv_txfm.h
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a
+  // decoder, therefore the following assertion is only enabled when
+  // configured with --enable-coefficient-range-checking.
+  assert(INT16_MIN <= input);
+  assert(input <= INT16_MAX);
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  return (tran_low_t)input;
+}
+
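+// Products of the cospi_*/sinpi_* constants are scaled by 2^DCT_CONST_BITS;
+// this helper rounds such a product back to coefficient precision and applies
+// the optional range check above.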
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return check_range(rv);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
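+  // For example, bd == 10 gives int_max = (1 << 17) - 1 = 131071, the largest
+  // value representable in a signed 18 bit integer.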
+  const int32_t int_max = (1 << (7 + bd)) - 1;
+  const int32_t int_min = -int_max - 1;
+  assert(int_min <= input);
+  assert(input <= int_max);
+  (void) int_min;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  (void) bd;
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return highbd_check_range(rv, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform uses a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However, to aid in hardware
+// verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// bd of 8 uses tran_low_t with 16 bits, need to remove 16 bits
+// bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
+// bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
+// bd of x uses tran_low_t with 8 + x bits, need to remove 24 - x bits
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif  // CONFIG_EMULATE_HARDWARE
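+
+// Illustrative note (not part of the upstream sources): with
+// CONFIG_EMULATE_HARDWARE enabled and bd == 8, WRAPLOW() keeps only the low
+// 16 bits of its argument and sign-extends them, e.g. on a typical two's
+// complement target:
+//   WRAPLOW(0x00012345, 8) -> 0x2345   (high bits discarded)
+//   WRAPLOW(0x0000ffff, 8) -> -1       (sign-extended)
+// Without CONFIG_EMULATE_HARDWARE the value is passed through unchanged.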
+
+void idct4_c(const tran_low_t *input, tran_low_t *output);
+void idct8_c(const tran_low_t *input, tran_low_t *output);
+void idct16_c(const tran_low_t *input, tran_low_t *output);
+void idct32_c(const tran_low_t *input, tran_low_t *output);
+void iadst4_c(const tran_low_t *input, tran_low_t *output);
+void iadst8_c(const tran_low_t *input, tran_low_t *output);
+void iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  trans = WRAPLOW(trans, bd);
+  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+  trans = WRAPLOW(trans, 8);
+  return clip_pixel(WRAPLOW(dest + trans, 8));
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_INV_TXFM_H_
diff --git a/libvpx/vpx_dsp/loopfilter.c b/libvpx/vpx_dsp/loopfilter.c
new file mode 100644
index 0000000..66f4d95
--- /dev/null
+++ b/libvpx/vpx_dsp/loopfilter.c
@@ -0,0 +1,745 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+  return (int8_t)clamp(t, -128, 127);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  switch (bd) {
+    case 10:
+      return (int16_t)clamp(t, -128*4, 128*4-1);
+    case 12:
+      return (int16_t)clamp(t, -128*16, 128*16-1);
+    case 8:
+    default:
+      return (int16_t)clamp(t, -128, 128-1);
+  }
+}
+#endif
+
+// should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
+                                 uint8_t p3, uint8_t p2,
+                                 uint8_t p1, uint8_t p0,
+                                 uint8_t q0, uint8_t q1,
+                                 uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  return ~mask;
+}
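+
+// Editorial note (not in the upstream sources): each "(cond) * -1" above
+// yields 0xff when the condition holds and 0x00 otherwise, so the ORs
+// accumulate constraint violations and the final ~mask is 0xff ("apply the
+// filter") only when every |difference| test passed.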
+
+static INLINE int8_t flat_mask4(uint8_t thresh,
+                                uint8_t p3, uint8_t p2,
+                                uint8_t p1, uint8_t p0,
+                                uint8_t q0, uint8_t q1,
+                                uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  mask |= (abs(p3 - p0) > thresh) * -1;
+  mask |= (abs(q3 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t flat_mask5(uint8_t thresh,
+                                uint8_t p4, uint8_t p3,
+                                uint8_t p2, uint8_t p1,
+                                uint8_t p0, uint8_t q0,
+                                uint8_t q1, uint8_t q2,
+                                uint8_t q3, uint8_t q4) {
+  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
+  mask |= (abs(p4 - p0) > thresh) * -1;
+  mask |= (abs(q4 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+// is there high edge variance at the internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                              uint8_t q0, uint8_t q1) {
+  int8_t hev = 0;
+  hev  |= (abs(p1 - p0) > thresh) * -1;
+  hev  |= (abs(q1 - q0) > thresh) * -1;
+  return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  int8_t filter1, filter2;
+
+  const int8_t ps1 = (int8_t) *op1 ^ 0x80;
+  const int8_t ps0 = (int8_t) *op0 ^ 0x80;
+  const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
+  const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
+  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+  // add outer taps if we have high edge variance
+  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+  // inner taps
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+  // Save bottom 3 bits so that we round one side +4 and the other +3;
+  // if it equals 4 we'll set it to adjust by -1 to account for the fact
+  // we'd round 3 the other way.
+  filter1 = signed_char_clamp(filter + 4) >> 3;
+  filter2 = signed_char_clamp(filter + 3) >> 3;
+
+  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
+  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+
+  // outer tap adjustments
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
+  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+}
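+
+// Editorial note (not in the upstream sources): a quick numeric check of the
+// +4/+3 rounding above: filter == 3 gives filter1 == filter2 == 0, while
+// filter == 4 gives filter1 == 1 and filter2 == 0. Within the clamped
+// -128..127 range the two sides only receive a different adjustment when the
+// bottom three bits of filter are 4.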
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+                            const uint8_t *blimit, const uint8_t *limit,
+                            const uint8_t *thresh, int count) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+    ++s;
+  }
+}
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+    s += pitch;
+  }
+}
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+                                  thresh1, 1);
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                           uint8_t *op3, uint8_t *op2,
+                           uint8_t *op1, uint8_t *op0,
+                           uint8_t *oq0, uint8_t *oq1,
+                           uint8_t *oq2, uint8_t *oq3) {
+  if (flat && mask) {
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {
+    filter4(mask, thresh, op1,  op0, oq0, oq1);
+  }
+}
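+
+// Editorial note (not in the upstream sources): the 7-tap weights
+// [1, 1, 1, 2, 1, 1, 1] sum to 8, which is why each output above is rounded
+// with ROUND_POWER_OF_TWO(..., 3); the 15-tap filter further down sums to 16
+// and correspondingly uses a shift of 4.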
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh,
+                            int count) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                                 s,         s + 1 * p, s + 2 * p, s + 3 * p);
+    ++s;
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count) {
+  int i;
+
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
+                                 s,     s + 1, s + 2, s + 3);
+    s += pitch;
+  }
+}
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+                                    thresh1, 1);
+}
+
+static INLINE void filter16(int8_t mask, uint8_t thresh,
+                            uint8_t flat, uint8_t flat2,
+                            uint8_t *op7, uint8_t *op6,
+                            uint8_t *op5, uint8_t *op4,
+                            uint8_t *op3, uint8_t *op2,
+                            uint8_t *op1, uint8_t *op0,
+                            uint8_t *oq0, uint8_t *oq1,
+                            uint8_t *oq2, uint8_t *oq3,
+                            uint8_t *oq4, uint8_t *oq5,
+                            uint8_t *oq6, uint8_t *oq7) {
+  if (flat2 && flat && mask) {
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {
+    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+  }
+}
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int count) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask5(1,
+                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
+
+    filter16(mask, *thresh, flat, flat2,
+             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+             s,         s + 1 * p, s + 2 * p, s + 3 * p,
+             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
+    ++s;
+  }
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh,
+                                   int count) {
+  int i;
+
+  for (i = 0; i < count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                    q0, s[4], s[5], s[6], s[7]);
+
+    filter16(mask, *thresh, flat, flat2,
+             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+    s += p;
+  }
+}
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+}
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+                                        uint16_t p3, uint16_t p2,
+                                        uint16_t p1, uint16_t p0,
+                                        uint16_t q0, uint16_t q1,
+                                        uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  mask |= (abs(p3 - p2) > limit16) * -1;
+  mask |= (abs(p2 - p1) > limit16) * -1;
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(q2 - q1) > limit16) * -1;
+  mask |= (abs(q3 - q2) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit16) * -1;
+  return ~mask;
+}
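+
+// Editorial note (not in the upstream sources): the 8-bit thresholds are
+// scaled to the coding bit depth by "<< (bd - 8)", i.e. by 4 for 10-bit and
+// by 16 for 12-bit input, so the same limit/blimit values cover the wider
+// pixel range.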
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
+                                       uint16_t p3, uint16_t p2,
+                                       uint16_t p1, uint16_t p0,
+                                       uint16_t q0, uint16_t q1,
+                                       uint16_t q2, uint16_t q3, int bd) {
+  int8_t mask = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p1 - p0) > thresh16) * -1;
+  mask |= (abs(q1 - q0) > thresh16) * -1;
+  mask |= (abs(p2 - p0) > thresh16) * -1;
+  mask |= (abs(q2 - q0) > thresh16) * -1;
+  mask |= (abs(p3 - p0) > thresh16) * -1;
+  mask |= (abs(q3 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
+                                       uint16_t p4, uint16_t p3,
+                                       uint16_t p2, uint16_t p1,
+                                       uint16_t p0, uint16_t q0,
+                                       uint16_t q1, uint16_t q2,
+                                       uint16_t q3, uint16_t q4, int bd) {
+  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  mask |= (abs(p4 - p0) > thresh16) * -1;
+  mask |= (abs(q4 - q0) > thresh16) * -1;
+  return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+                                      uint16_t q0, uint16_t q1, int bd) {
+  int16_t hev = 0;
+  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  hev |= (abs(p1 - p0) > thresh16) * -1;
+  hev |= (abs(q1 - q0) > thresh16) * -1;
+  return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                  int bd) {
+  int16_t filter1, filter2;
+  // Subtracting (0x80 << shift) is the high-bitdepth equivalent of ^0x80 in
+  // the 8-bit filter: it turns 0..255 values into -128..+127 (for bd == 8).
+  int shift = bd - 8;
+  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+  // Add outer taps if we have high edge variance.
+  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+  // Inner taps.
+  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Save bottom 3 bits so that we round one side +4 and the other +3;
+  // if it equals 4 we'll set it to adjust by -1 to account for the fact
+  // we'd round 3 the other way.
+  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+  // Outer tap adjustments.
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int count, int bd) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    ++s;
+  }
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+    s += pitch;
+  }
+}
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                                  uint16_t *op3, uint16_t *op2,
+                                  uint16_t *op1, uint16_t *op0,
+                                  uint16_t *oq0, uint16_t *oq1,
+                                  uint16_t *oq2, uint16_t *oq3, int bd) {
+  if (flat && mask) {
+    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {
+    highbd_filter4(mask, thresh, op1,  op0, oq0, oq1, bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                         p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    highbd_filter8(mask, *thresh, flat,
+                 s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                 s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    ++s;
+  }
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  int i;
+
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    highbd_filter8(mask, *thresh, flat,
+                 s - 4, s - 3, s - 2, s - 1,
+                 s, s + 1, s + 2, s + 3,
+                 bd);
+    s += pitch;
+  }
+}
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
+                                   uint8_t flat, uint8_t flat2,
+                                   uint16_t *op7, uint16_t *op6,
+                                   uint16_t *op5, uint16_t *op4,
+                                   uint16_t *op3, uint16_t *op2,
+                                   uint16_t *op1, uint16_t *op0,
+                                   uint16_t *oq0, uint16_t *oq1,
+                                   uint16_t *oq2, uint16_t *oq3,
+                                   uint16_t *oq4, uint16_t *oq5,
+                                   uint16_t *oq6, uint16_t *oq7, int bd) {
+  if (flat2 && flat && mask) {
+    const uint16_t p7 = *op7;
+    const uint16_t p6 = *op6;
+    const uint16_t p5 = *op5;
+    const uint16_t p4 = *op4;
+    const uint16_t p3 = *op3;
+    const uint16_t p2 = *op2;
+    const uint16_t p1 = *op1;
+    const uint16_t p0 = *op0;
+    const uint16_t q0 = *oq0;
+    const uint16_t q1 = *oq1;
+    const uint16_t q2 = *oq2;
+    const uint16_t q3 = *oq3;
+    const uint16_t q4 = *oq4;
+    const uint16_t q5 = *oq5;
+    const uint16_t q6 = *oq6;
+    const uint16_t q7 = *oq7;
+
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {
+    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+                   bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int count, int bd) {
+  int i;
+
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+        q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                    s, s + 1 * p, s + 2 * p, s + 3 * p,
+                    s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p,
+                    bd);
+    ++s;
+  }
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int count, int bd) {
+  int i;
+
+  for (i = 0; i < count; ++i) {
+    const uint16_t p3 = s[-4];
+    const uint16_t p2 = s[-3];
+    const uint16_t p1 = s[-2];
+    const uint16_t p0 = s[-1];
+    const uint16_t q0 = s[0];
+    const uint16_t q1 = s[1];
+    const uint16_t q2 = s[2];
+    const uint16_t q3 = s[3];
+    const int8_t mask = highbd_filter_mask(*limit, *blimit,
+                                           p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3,
+                                          bd);
+    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                           q0, s[4], s[5], s[6], s[7], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
+                    bd);
+    s += p;
+  }
+}
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh,
+                                  int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
+                                       int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/mips/common_dspr2.c b/libvpx/vpx_dsp/mips/common_dspr2.c
new file mode 100644
index 0000000..b22f084
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/common_dspr2.c
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vpx_ff_cropTbl;
+
+void vpx_dsputil_static_init(void) {
+  int i;
+
+  for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+  for (i = 0; i < CROP_WIDTH; i++) {
+    vpx_ff_cropTbl_a[i] = 0;
+    vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+  }
+
+  vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH];
+}
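+
+// Editorial note (not in the upstream sources): after this init,
+// vpx_ff_cropTbl behaves as a branch-free clamp to [0, 255] for indices in
+// [-CROP_WIDTH, 256 + CROP_WIDTH), e.g. vpx_ff_cropTbl[-7] == 0,
+// vpx_ff_cropTbl[128] == 128 and vpx_ff_cropTbl[300] == 255.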
+
+#endif
diff --git a/libvpx/vpx_dsp/mips/common_dspr2.h b/libvpx/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000..7a10bf1
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/common_dspr2.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_COMMON_MIPS_DSPR2_H_
+#define VPX_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *vpx_ff_cropTbl;  // Defined in "vpx_dsp/mips/common_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
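+
+/* Editorial note (not in the upstream sources): the "pref" hint values used
+   above follow the MIPS32 encoding: 0 = load, 1 = store, 4 = load_streamed,
+   5 = store_streamed (data expected to be used once rather than kept in the
+   cache). */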
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_COMMON_MIPS_DSPR2_H_
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
similarity index 86%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
index 91d62bc..3c76767 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
@@ -30,7 +28,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2;
   uint32_t      p1, p2;
@@ -44,7 +42,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -134,7 +132,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2;
   uint32_t      p1, p2;
@@ -148,8 +146,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -230,52 +228,46 @@
   }
 }
 
-void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
-  if (16 == y_step_q4) {
-    uint32_t pos = 38;
+  uint32_t pos = 38;
 
-    /* bit positon for extract from acc */
-    __asm__ __volatile__ (
-      "wrdsp      %[pos],     1           \n\t"
-      :
-      : [pos] "r" (pos)
-    );
+  assert(y_step_q4 == 16);
 
-    vp9_prefetch_store(dst);
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
 
-    switch (w) {
-      case 4:
-      case 8:
-      case 16:
-      case 32:
-        convolve_bi_avg_vert_4_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_y, w, h);
-        break;
-      case 64:
-        vp9_prefetch_store(dst + 32);
-        convolve_bi_avg_vert_64_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_y, h);
-        break;
-      default:
-        vp9_convolve8_avg_vert_c(src, src_stride,
-                                 dst, dst_stride,
-                                 filter_x, x_step_q4,
-                                 filter_y, y_step_q4,
-                                 w, h);
-        break;
-    }
-  } else {
-    vp9_convolve8_avg_vert_c(src, src_stride,
-                             dst, dst_stride,
-                             filter_x, x_step_q4,
-                             filter_y, y_step_q4,
-                             w, h);
+  prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+    case 8:
+    case 16:
+    case 32:
+      convolve_bi_avg_vert_4_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_y, w, h);
+      break;
+    case 64:
+      prefetch_store(dst + 32);
+      convolve_bi_avg_vert_64_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_y, h);
+      break;
+    default:
+      vpx_convolve8_avg_vert_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+      break;
   }
 }
 #endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
similarity index 93%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
index 148b20f..932a73d 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
@@ -27,7 +25,7 @@
                                           const int16_t *filter_x0,
                                           int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   int32_t  Temp1, Temp2, Temp3, Temp4;
   uint32_t vector4a = 64;
   uint32_t tp1, tp2;
@@ -40,9 +38,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -122,7 +120,7 @@
                                          const int16_t *filter_x0,
                                          int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t tp1, tp2, tp3, tp4;
@@ -135,9 +133,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -274,7 +272,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t qload1, qload2, qload3;
@@ -290,9 +288,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -523,7 +521,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t qload1, qload2, qload3;
@@ -539,11 +537,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -765,69 +763,63 @@
   }
 }
 
-void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h) {
-  if (16 == x_step_q4) {
-    uint32_t pos = 38;
+  uint32_t pos = 38;
 
-    /* bit positon for extract from acc */
-    __asm__ __volatile__ (
-      "wrdsp      %[pos],     1           \n\t"
-      :
-      : [pos] "r" (pos)
-    );
+  assert(x_step_q4 == 16);
 
-    /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
 
-    switch (w) {
-      case 4:
-        convolve_bi_avg_horiz_4_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_x, h);
-        break;
-      case 8:
-        convolve_bi_avg_horiz_8_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_x, h);
-        break;
-      case 16:
-        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h, 1);
-        break;
-      case 32:
-        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h, 2);
-        break;
-      case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+  /* prefetch data to cache memory */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
-        convolve_bi_avg_horiz_64_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h);
-        break;
-      default:
-        vp9_convolve8_avg_horiz_c(src, src_stride,
-                                  dst, dst_stride,
-                                  filter_x, x_step_q4,
-                                  filter_y, y_step_q4,
-                                  w, h);
-        break;
-    }
-  } else {
-    vp9_convolve8_avg_horiz_c(src, src_stride,
-                              dst, dst_stride,
-                              filter_x, x_step_q4,
-                              filter_y, y_step_q4,
-                              w, h);
+  switch (w) {
+    case 4:
+      convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+      break;
+    case 8:
+      convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+      break;
+    case 16:
+      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, h, 1);
+      break;
+    case 32:
+      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, h, 2);
+      break;
+    case 64:
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+
+      convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, h);
+      break;
+    default:
+      vpx_convolve8_avg_horiz_c(src, src_stride,
+                                dst, dst_stride,
+                                filter_x, x_step_q4,
+                                filter_y, y_step_q4,
+                                w, h);
+      break;
   }
 }
 #endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_dspr2.c
similarity index 97%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve2_dspr2.c
index 92644f2..d111029 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
@@ -27,7 +25,7 @@
                                                  const int16_t *filter_x0,
                                                  int32_t h) {
   int32_t       y;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint8_t       *dst_ptr;
   int32_t       Temp1, Temp2;
   uint32_t      vector4a = 64;
@@ -41,8 +39,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -117,7 +115,7 @@
                                                  const int16_t *filter_x0,
                                                  int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint8_t *dst_ptr;
   uint32_t vector4a = 64;
   int32_t Temp1, Temp2, Temp3;
@@ -132,8 +130,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -257,7 +255,7 @@
   int32_t       c, y;
   const uint8_t *src;
   uint8_t       *dst;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector_64 = 64;
   int32_t       Temp1, Temp2, Temp3;
   uint32_t      qload1, qload2;
@@ -272,8 +270,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -489,7 +487,7 @@
   int32_t       c, y;
   const uint8_t *src;
   uint8_t       *dst;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector_64 = 64;
   int32_t       Temp1, Temp2, Temp3;
   uint32_t      qload1, qload2;
@@ -504,9 +502,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -733,7 +731,7 @@
   }
 }
 
-void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter,
                          int w, int h) {
@@ -747,8 +745,8 @@
   );
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
+  prefetch_load(src);
+  prefetch_load(src + 32);
 
   switch (w) {
     case 4:
@@ -769,7 +767,7 @@
                                             (w/16));
       break;
     case 64:
-      vp9_prefetch_load(src + 32);
+      prefetch_load(src + 32);
       convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                             dst, dst_stride,
                                             filter, h);
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
similarity index 91%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
index 1debdb4..9fe1a34 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
@@ -27,7 +25,7 @@
                                       const int16_t *filter_x0,
                                       int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   int32_t Temp1, Temp2, Temp3, Temp4;
   uint32_t vector4a = 64;
   uint32_t tp1, tp2;
@@ -39,9 +37,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -109,7 +107,7 @@
                                       const int16_t *filter_x0,
                                       int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t tp1, tp2, tp3;
@@ -122,9 +120,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -236,7 +234,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t qload1, qload2, qload3;
@@ -252,9 +250,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -443,7 +441,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t Temp1, Temp2, Temp3;
   uint32_t qload1, qload2, qload3;
@@ -459,11 +457,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -643,71 +641,65 @@
   }
 }
 
-void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
-  if (16 == x_step_q4) {
-    uint32_t pos = 38;
+  uint32_t pos = 38;
 
-    vp9_prefetch_load((const uint8_t *)filter_x);
+  assert(x_step_q4 == 16);
 
-    /* bit positon for extract from acc */
-    __asm__ __volatile__ (
-      "wrdsp      %[pos],     1           \n\t"
-      :
-      : [pos] "r" (pos)
-    );
+  prefetch_load((const uint8_t *)filter_x);
 
-    /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+  /* bit positon for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
 
-    switch (w) {
-      case 4:
-        convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h);
-        break;
-      case 8:
-        convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h);
-        break;
-      case 16:
-        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
-                                   dst, (int32_t)dst_stride,
-                                   filter_x, (int32_t)h, 1);
-        break;
-      case 32:
-        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
-                                   dst, (int32_t)dst_stride,
-                                   filter_x, (int32_t)h, 2);
-        break;
-      case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+  /* prefetch data to cache memory */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
-        convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
-                                   dst, (int32_t)dst_stride,
-                                   filter_x, (int32_t)h);
-        break;
-      default:
-        vp9_convolve8_horiz_c(src, src_stride,
-                              dst, dst_stride,
-                              filter_x, x_step_q4,
-                              filter_y, y_step_q4,
-                              w, h);
-        break;
-    }
-  } else {
-    vp9_convolve8_horiz_c(src, src_stride,
-                          dst, dst_stride,
-                          filter_x, x_step_q4,
-                          filter_y, y_step_q4,
-                          w, h);
+  switch (w) {
+    case 4:
+      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+                                dst, (int32_t)dst_stride,
+                                filter_x, (int32_t)h);
+      break;
+    case 8:
+      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+                                dst, (int32_t)dst_stride,
+                                filter_x, (int32_t)h);
+      break;
+    case 16:
+      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filter_x, (int32_t)h, 1);
+      break;
+    case 32:
+      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filter_x, (int32_t)h, 2);
+      break;
+    case 64:
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+
+      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filter_x, (int32_t)h);
+      break;
+    default:
+      vpx_convolve8_horiz_c(src, src_stride,
+                            dst, dst_stride,
+                            filter_x, x_step_q4,
+                            filter_y, y_step_q4,
+                            w, h);
+      break;
   }
 }
 #endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
similarity index 85%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
index bf01f11..dde6ffd 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_bi_vert_4_dspr2(const uint8_t *src,
@@ -30,7 +28,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2;
   uint32_t      p1, p2;
@@ -44,7 +42,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -127,7 +125,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2;
   uint32_t      p1, p2;
@@ -141,7 +139,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -215,52 +213,46 @@
   }
 }
 
-void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
-  if (16 == y_step_q4) {
-    uint32_t pos = 38;
+  uint32_t pos = 38;
 
-    /* bit positon for extract from acc */
-    __asm__ __volatile__ (
-      "wrdsp      %[pos],     1           \n\t"
-      :
-      : [pos] "r" (pos)
-    );
+  assert(y_step_q4 == 16);
 
-    vp9_prefetch_store(dst);
+  /* bit positon for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
 
-    switch (w) {
-      case 4 :
-      case 8 :
-      case 16 :
-      case 32 :
-        convolve_bi_vert_4_dspr2(src, src_stride,
-                                 dst, dst_stride,
-                                 filter_y, w, h);
-        break;
-      case 64 :
-        vp9_prefetch_store(dst + 32);
-        convolve_bi_vert_64_dspr2(src, src_stride,
-                                  dst, dst_stride,
-                                  filter_y, h);
-        break;
-      default:
-        vp9_convolve8_vert_c(src, src_stride,
-                             dst, dst_stride,
-                             filter_x, x_step_q4,
-                             filter_y, y_step_q4,
-                             w, h);
-        break;
-    }
-  } else {
-    vp9_convolve8_vert_c(src, src_stride,
-                         dst, dst_stride,
-                         filter_x, x_step_q4,
-                         filter_y, y_step_q4,
-                         w, h);
+  prefetch_store(dst);
+
+  switch (w) {
+    case 4 :
+    case 8 :
+    case 16 :
+    case 32 :
+      convolve_bi_vert_4_dspr2(src, src_stride,
+                               dst, dst_stride,
+                               filter_y, w, h);
+      break;
+    case 64 :
+      prefetch_store(dst + 32);
+      convolve_bi_vert_64_dspr2(src, src_stride,
+                                dst, dst_stride,
+                                filter_y, h);
+      break;
+    default:
+      vpx_convolve8_vert_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+      break;
   }
 }
 #endif
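
The two convolve2 wrappers above drop the runtime "if (16 == x_step_q4/y_step_q4)" fallback to the
generic C routine and assert the step instead, leaving only the width switch. A minimal,
self-contained sketch of that dispatch shape (the kernel names and bodies here are illustrative
stubs, not the library's dspr2 kernels):

  #include <assert.h>
  #include <stdio.h>

  /* Stub kernels; the real dspr2 versions are specialized per width. */
  static void kernel_w4(int h)  { printf("4-wide kernel,  h=%d\n", h); }
  static void kernel_w64(int h) { printf("64-wide kernel, h=%d\n", h); }
  static void generic_c(int w, int h) { printf("generic C path, %dx%d\n", w, h); }

  /* After the refactor an unsupported step is a caller bug, not a fallback. */
  static void convolve_vert_dispatch(int step_q4, int w, int h) {
    assert(step_q4 == 16);   /* was: if (16 == step_q4) { ... } else generic_c */
    switch (w) {
      case 4: case 8: case 16: case 32: kernel_w4(h);    break;
      case 64:                          kernel_w64(h);   break;
      default:                          generic_c(w, h); break;
    }
  }

  int main(void) {
    convolve_vert_dispatch(16, 32, 16);
    convolve_vert_dispatch(16, 64, 64);
    return 0;
  }
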
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
similarity index 89%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
index ab18490..43da9e5 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_avg_vert_4_dspr2(const uint8_t *src,
@@ -30,7 +28,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2, load3, load4;
   uint32_t      p1, p2;
@@ -49,7 +47,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -191,7 +189,7 @@
   int32_t       x, y;
   const uint8_t *src_ptr;
   uint8_t       *dst_ptr;
-  uint8_t       *cm = vp9_ff_cropTbl;
+  uint8_t       *cm = vpx_ff_cropTbl;
   uint32_t      vector4a = 64;
   uint32_t      load1, load2, load3, load4;
   uint32_t      p1, p2;
@@ -210,8 +208,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -344,105 +342,89 @@
   }
 }
 
-void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
-  if (((const int32_t *)filter_y)[1] == 0x800000) {
-    vp9_convolve_avg(src, src_stride,
-                     dst, dst_stride,
-                     filter_x, x_step_q4,
-                     filter_y, y_step_q4,
-                     w, h);
-  } else if (((const int32_t *)filter_y)[0] == 0) {
-    vp9_convolve2_avg_vert_dspr2(src, src_stride,
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  if (((const int32_t *)filter_y)[0] == 0) {
+    vpx_convolve2_avg_vert_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
   } else {
-    if (16 == y_step_q4) {
-      uint32_t pos = 38;
+    uint32_t pos = 38;
 
-      /* bit positon for extract from acc */
-      __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
-        :
-        : [pos] "r" (pos)
-      );
+    /* bit positon for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
 
-      vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
-      switch (w) {
-        case 4:
-        case 8:
-        case 16:
-        case 32:
-          convolve_avg_vert_4_dspr2(src, src_stride,
-                                    dst, dst_stride,
-                                    filter_y, w, h);
-          break;
-        case 64:
-          vp9_prefetch_store(dst + 32);
-          convolve_avg_vert_64_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_y, h);
-          break;
-        default:
-          vp9_convolve8_avg_vert_c(src, src_stride,
+    switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+        convolve_avg_vert_4_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_y, w, h);
+        break;
+      case 64:
+        prefetch_store(dst + 32);
+        convolve_avg_vert_64_dspr2(src, src_stride,
                                    dst, dst_stride,
-                                   filter_x, x_step_q4,
-                                   filter_y, y_step_q4,
-                                   w, h);
-          break;
-      }
-    } else {
-      vp9_convolve8_avg_vert_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
+                                   filter_y, h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_x, x_step_q4,
+                                 filter_y, y_step_q4,
+                                 w, h);
+        break;
     }
   }
 }
 
-void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
 
   assert(w <= 64);
   assert(h <= 64);
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
 
   if (intermediate_height < h)
     intermediate_height = h;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_avg_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
-
-  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
+  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                       temp, 64,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, intermediate_height);
 
-  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
+  vpx_convolve8_avg_vert(temp + 64 * 3, 64,
                          dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4,
                          w, h);
 }
 
-void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
@@ -452,17 +434,17 @@
   uint32_t tp3, tp4, tn2;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -482,9 +464,9 @@
     case 8:
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -509,9 +491,9 @@
     case 16:
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -544,9 +526,9 @@
     case 32:
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -593,16 +575,16 @@
       }
       break;
     case 64:
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
similarity index 94%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
index 69da1cf..db0c2a4 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
@@ -27,7 +25,7 @@
                                        const int16_t *filter_x0,
                                        int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   int32_t  vector1b, vector2b, vector3b, vector4b;
   int32_t  Temp1, Temp2, Temp3, Temp4;
   uint32_t vector4a = 64;
@@ -43,9 +41,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -149,7 +147,7 @@
                                        const int16_t *filter_x0,
                                        int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   int32_t vector1b, vector2b, vector3b, vector4b;
   int32_t Temp1, Temp2, Temp3;
@@ -165,9 +163,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -339,7 +337,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t filter12, filter34, filter56, filter78;
   int32_t Temp1, Temp2, Temp3;
@@ -357,9 +355,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -650,7 +648,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t filter12, filter34, filter56, filter78;
   int32_t Temp1, Temp2, Temp3;
@@ -668,11 +666,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -954,84 +952,73 @@
   }
 }
 
-void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h) {
-  if (((const int32_t *)filter_x)[1] == 0x800000) {
-    vp9_convolve_avg(src, src_stride,
-                     dst, dst_stride,
-                     filter_x, x_step_q4,
-                     filter_y, y_step_q4,
-                     w, h);
-  } else if (((const int32_t *)filter_x)[0] == 0) {
-    vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  if (((const int32_t *)filter_x)[0] == 0) {
+    vpx_convolve2_avg_horiz_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
   } else {
-    if (16 == x_step_q4) {
-      uint32_t pos = 38;
+    uint32_t pos = 38;
 
-      src -= 3;
+    src -= 3;
 
-      /* bit positon for extract from acc */
-      __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
-        :
-        : [pos] "r" (pos)
-      );
+    /* bit positon for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
 
-      /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+    /* prefetch data to cache memory */
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
-      switch (w) {
-        case 4:
-          convolve_avg_horiz_4_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_x, h);
-          break;
-        case 8:
-          convolve_avg_horiz_8_dspr2(src, src_stride,
-                                     dst, dst_stride,
-                                     filter_x, h);
-          break;
-        case 16:
-          convolve_avg_horiz_16_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h, 1);
-          break;
-        case 32:
-          convolve_avg_horiz_16_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h, 2);
-          break;
-        case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
-
-          convolve_avg_horiz_64_dspr2(src, src_stride,
-                                      dst, dst_stride,
-                                      filter_x, h);
-          break;
-        default:
-          vp9_convolve8_avg_horiz_c(src + 3, src_stride,
+    switch (w) {
+      case 4:
+        convolve_avg_horiz_4_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+        break;
+      case 8:
+        convolve_avg_horiz_8_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+        break;
+      case 16:
+        convolve_avg_horiz_16_dspr2(src, src_stride,
                                     dst, dst_stride,
-                                    filter_x, x_step_q4,
-                                    filter_y, y_step_q4,
-                                    w, h);
-          break;
-      }
-    } else {
-      vp9_convolve8_avg_horiz_c(src, src_stride,
-                                dst, dst_stride,
-                                filter_x, x_step_q4,
-                                filter_y, y_step_q4,
-                                w, h);
+                                    filter_x, h, 1);
+        break;
+      case 32:
+        convolve_avg_horiz_16_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, h, 2);
+        break;
+      case 64:
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
+
+        convolve_avg_horiz_64_dspr2(src, src_stride,
+                                    dst, dst_stride,
+                                    filter_x, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src + 3, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, x_step_q4,
+                                  filter_y, y_step_q4,
+                                  w, h);
+        break;
     }
   }
 }
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_dspr2.c
similarity index 95%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve8_dspr2.c
index 0ef9dd5..ddad186 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -11,31 +11,13 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
-uint8_t *vp9_ff_cropTbl;
-
-void vp9_dsputil_static_init(void) {
-  int i;
-
-  for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
-
-  for (i = 0; i < CROP_WIDTH; i++) {
-    vp9_ff_cropTbl_a[i] = 0;
-    vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
-  }
-
-  vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
-}
-
 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
@@ -43,7 +25,7 @@
                                               const int16_t *filter_x0,
                                               int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint8_t *dst_ptr;
   int32_t vector1b, vector2b, vector3b, vector4b;
   int32_t Temp1, Temp2, Temp3, Temp4;
@@ -60,8 +42,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -159,7 +141,7 @@
                                               const int16_t *filter_x0,
                                               int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint8_t *dst_ptr;
   uint32_t vector4a = 64;
   int32_t vector1b, vector2b, vector3b, vector4b;
@@ -176,8 +158,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -338,7 +320,7 @@
   int32_t c, y;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t  filter12, filter34, filter56, filter78;
   int32_t  Temp1, Temp2, Temp3;
@@ -355,8 +337,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -628,7 +610,7 @@
   int32_t c, y;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t  filter12, filter34, filter56, filter78;
   int32_t  Temp1, Temp2, Temp3;
@@ -645,9 +627,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -945,15 +927,21 @@
   }
 }
 
-void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
   uint32_t pos = 38;
 
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+
   /* bit positon for extract from acc */
   __asm__ __volatile__ (
     "wrdsp      %[pos],     1           \n\t"
@@ -964,28 +952,13 @@
   if (intermediate_height < h)
     intermediate_height = h;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
-
-  if ((((const int32_t *)filter_x)[1] == 0x800000)
-      && (((const int32_t *)filter_y)[1] == 0x800000))
-    return vp9_convolve_copy(src, src_stride,
-                             dst, dst_stride,
-                             filter_x, x_step_q4,
-                             filter_y, y_step_q4,
-                             w, h);
-
   /* copy the src to dst */
   if (filter_x[3] == 0x80) {
     copy_horiz_transposed(src - src_stride * 3, src_stride,
                           temp, intermediate_height,
                           w, intermediate_height);
   } else if (((const int32_t *)filter_x)[0] == 0) {
-    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+    vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
                         temp, intermediate_height,
                         filter_x,
                         w, intermediate_height);
@@ -993,8 +966,8 @@
     src -= (src_stride * 3 + 3);
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
+    prefetch_load(src);
+    prefetch_load(src + 32);
 
     switch (w) {
       case 4:
@@ -1015,7 +988,7 @@
                                            (w/16));
         break;
       case 64:
-        vp9_prefetch_load(src + 32);
+        prefetch_load(src + 32);
         convolve_horiz_64_transposed_dspr2(src, src_stride,
                                            temp, intermediate_height,
                                            filter_x, intermediate_height);
@@ -1034,7 +1007,7 @@
                           dst, dst_stride,
                           h, w);
   } else if (((const int32_t *)filter_y)[0] == 0) {
-    vp9_convolve2_dspr2(temp + 3, intermediate_height,
+    vpx_convolve2_dspr2(temp + 3, intermediate_height,
                         dst, dst_stride,
                         filter_y,
                         h, w);
@@ -1070,7 +1043,7 @@
   }
 }
 
-void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
                              const int16_t *filter_y, int filter_y_stride,
@@ -1078,9 +1051,9 @@
   int x, y;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
@@ -1089,9 +1062,9 @@
 
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         (%[src])      \n\t"
@@ -1112,9 +1085,9 @@
 
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1137,9 +1110,9 @@
 
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1169,9 +1142,9 @@
 
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1209,16 +1182,16 @@
       uint32_t tp1, tp2, tp3, tp4;
       uint32_t tp5, tp6, tp7, tp8;
 
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
similarity index 93%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
index 0303896..ae78bab 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_horiz_4_dspr2(const uint8_t *src,
@@ -27,7 +25,7 @@
                                    const int16_t *filter_x0,
                                    int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   int32_t vector1b, vector2b, vector3b, vector4b;
   int32_t Temp1, Temp2, Temp3, Temp4;
   uint32_t vector4a = 64;
@@ -43,9 +41,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -138,7 +136,7 @@
                                    const int16_t *filter_x0,
                                    int32_t h) {
   int32_t y;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   int32_t vector1b, vector2b, vector3b, vector4b;
   int32_t Temp1, Temp2, Temp3;
@@ -154,9 +152,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -305,7 +303,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t filter12, filter34, filter56, filter78;
   int32_t Temp1, Temp2, Temp3;
@@ -323,9 +321,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -575,7 +573,7 @@
   int32_t y, c;
   const uint8_t *src;
   uint8_t *dst;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector_64 = 64;
   int32_t filter12, filter34, filter56, filter78;
   int32_t Temp1, Temp2, Temp3;
@@ -593,11 +591,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -838,85 +836,74 @@
   }
 }
 
-void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
-  if (((const int32_t *)filter_x)[1] == 0x800000) {
-    vp9_convolve_copy(src, src_stride,
-                      dst, dst_stride,
-                      filter_x, x_step_q4,
-                      filter_y, y_step_q4,
-                      w, h);
-  } else if (((const int32_t *)filter_x)[0] == 0) {
-    vp9_convolve2_horiz_dspr2(src, src_stride,
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  if (((const int32_t *)filter_x)[0] == 0) {
+    vpx_convolve2_horiz_dspr2(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
   } else {
-    if (16 == x_step_q4) {
-      uint32_t pos = 38;
+    uint32_t pos = 38;
 
-      vp9_prefetch_load((const uint8_t *)filter_x);
-      src -= 3;
+    prefetch_load((const uint8_t *)filter_x);
+    src -= 3;
 
-      /* bit positon for extract from acc */
-      __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
-        :
-        : [pos] "r" (pos)
-      );
+    /* bit positon for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
 
-      /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+    /* prefetch data to cache memory */
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
-      switch (w) {
-        case 4:
-          convolve_horiz_4_dspr2(src, (int32_t)src_stride,
-                                 dst, (int32_t)dst_stride,
-                                 filter_x, (int32_t)h);
-          break;
-        case 8:
-          convolve_horiz_8_dspr2(src, (int32_t)src_stride,
-                                 dst, (int32_t)dst_stride,
-                                 filter_x, (int32_t)h);
-          break;
-        case 16:
-          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h, 1);
-          break;
-        case 32:
-          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h, 2);
-          break;
-        case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+    switch (w) {
+      case 4:
+        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
+                               dst, (int32_t)dst_stride,
+                               filter_x, (int32_t)h);
+        break;
+      case 8:
+        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
+                               dst, (int32_t)dst_stride,
+                               filter_x, (int32_t)h);
+        break;
+      case 16:
+        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+                                dst, (int32_t)dst_stride,
+                                filter_x, (int32_t)h, 1);
+        break;
+      case 32:
+        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+                                dst, (int32_t)dst_stride,
+                                filter_x, (int32_t)h, 2);
+        break;
+      case 64:
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
-          convolve_horiz_64_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h);
-          break;
-        default:
-          vp9_convolve8_horiz_c(src + 3, src_stride,
-                                dst, dst_stride,
-                                filter_x, x_step_q4,
-                                filter_y, y_step_q4,
-                                w, h);
-          break;
-      }
-    } else {
-      vp9_convolve8_horiz_c(src, src_stride,
-                            dst, dst_stride,
-                            filter_x, x_step_q4,
-                            filter_y, y_step_q4,
-                            w, h);
+        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
+                                dst, (int32_t)dst_stride,
+                                filter_x, (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src + 3, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+        break;
     }
   }
 }
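
The horizontal wrapper above now asserts that ((const int32_t *)filter_x)[1] != 0x800000 (the
copy-only filter is expected to be routed to the copy path before reaching here) and still
branches to the 2-tap convolve2 path when word 0 of the 8-tap filter is zero. Reading two int16
taps as one int32 word the way the library does assumes a little-endian target; a small check of
what those two tests mean, using an illustrative copy filter:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    /* An 8-tap filter whose only nonzero tap is the centre tap of 128:
       the "just copy the pixel" filter. */
    int16_t filter[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
    const int32_t *words = (const int32_t *)filter;  /* same cast the kernels use */

    /* On a little-endian target, word 1 packs taps 2 and 3:
       0 | (128 << 16) == 0x800000 -- the value the new asserts reject. */
    printf("word0 = 0x%08x  word1 = 0x%08x\n", (unsigned)words[0], (unsigned)words[1]);

    if (words[1] == 0x800000)
      printf("copy filter: belongs to the copy/avg path, not the 8-tap kernels\n");
    else if (words[0] == 0)
      printf("taps 0 and 1 are zero: cheap 2-tap (convolve2) path\n");
    else
      printf("full 8-tap path\n");
    return 0;
  }
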
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
similarity index 89%
rename from libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
rename to libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
index 0930bb3..d553828 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -11,13 +11,11 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx/vpx_integer.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
 static void convolve_vert_4_dspr2(const uint8_t *src,
@@ -30,7 +28,7 @@
   int32_t x, y;
   const uint8_t *src_ptr;
   uint8_t *dst_ptr;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   uint32_t load1, load2, load3, load4;
   uint32_t p1, p2;
@@ -49,7 +47,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -184,7 +182,7 @@
   int32_t x, y;
   const uint8_t *src_ptr;
   uint8_t *dst_ptr;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
   uint32_t vector4a = 64;
   uint32_t load1, load2, load3, load4;
   uint32_t p1, p2;
@@ -203,8 +201,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -330,65 +328,54 @@
   }
 }
 
-void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
-  if (((const int32_t *)filter_y)[1] == 0x800000) {
-    vp9_convolve_copy(src, src_stride,
-                      dst, dst_stride,
-                      filter_x, x_step_q4,
-                      filter_y, y_step_q4,
-                      w, h);
-  } else if (((const int32_t *)filter_y)[0] == 0) {
-    vp9_convolve2_vert_dspr2(src, src_stride,
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  if (((const int32_t *)filter_y)[0] == 0) {
+    vpx_convolve2_vert_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
   } else {
-    if (16 == y_step_q4) {
-      uint32_t pos = 38;
+    uint32_t pos = 38;
 
-      /* bit positon for extract from acc */
-      __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
-        :
-        : [pos] "r" (pos)
-      );
+    /* bit positon for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
 
-      vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
-      switch (w) {
-        case 4 :
-        case 8 :
-        case 16 :
-        case 32 :
-          convolve_vert_4_dspr2(src, src_stride,
-                                dst, dst_stride,
-                                filter_y, w, h);
-          break;
-        case 64 :
-          vp9_prefetch_store(dst + 32);
-          convolve_vert_64_dspr2(src, src_stride,
-                                 dst, dst_stride,
-                                 filter_y, h);
-          break;
-        default:
-          vp9_convolve8_vert_c(src, src_stride,
+    switch (w) {
+      case 4 :
+      case 8 :
+      case 16 :
+      case 32 :
+        convolve_vert_4_dspr2(src, src_stride,
+                              dst, dst_stride,
+                              filter_y, w, h);
+        break;
+      case 64 :
+        prefetch_store(dst + 32);
+        convolve_vert_64_dspr2(src, src_stride,
                                dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
-          break;
-      }
-    } else {
-      vp9_convolve8_vert_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
+                               filter_y, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
+        break;
     }
   }
 }
diff --git a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
new file mode 100644
index 0000000..66d77a2
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
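+/* Prototypes for the 2-tap DSPr2 convolution kernels, used when the filter
+   is effectively bilinear; the convolve8 wrappers dispatch to them. */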
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h);
+
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h);
+
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h);
+
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter,
+                         int w, int h);
+
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
diff --git a/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
new file mode 100644
index 0000000..2115a34
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -0,0 +1,955 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
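+/* 32x32 forward DCT, MSA version.  The transform is split into a column pass
+   (fdct8x32_1d_column, run on 8-column slices) that writes into a temporary
+   buffer, and a row pass (fdct32x8_1d_row and variants) that transposes,
+   transforms and stores the final coefficients. */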
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 step0, step1, step2, step3;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  v8i16 step0_1, step1_1, step2_1, step3_1;
+
+  /* 1st and 2nd set */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+
+  /* 3rd and 4th set */
+  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 temp0, temp1;
+
+  /* fdct even */
+  LD_SH4(input, 8, in0, in1, in2, in3);
+  LD_SH4(input + 96, 8, in12, in13, in14, in15);
+  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
+              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
+  LD_SH4(input + 32, 8, in4, in5, in6, in7);
+  LD_SH4(input + 64, 8, in8, in9, in10, in11);
+  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
+              vec4, vec5, vec6, vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp);
+  ST_SH(temp1, temp + 512);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 256);
+  ST_SH(temp1, temp + 768);
+
+  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 128);
+  ST_SH(temp1, temp + 896);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 640);
+  ST_SH(temp1, temp + 384);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 64);
+  ST_SH(temp1, temp + 960);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 576);
+  ST_SH(temp1, temp + 448);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 320);
+  ST_SH(temp1, temp + 704);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 192);
+  ST_SH(temp1, temp + 832);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+  in20 = LD_SH(input + 32);
+  in21 = LD_SH(input + 40);
+  in26 = LD_SH(input + 80);
+  in27 = LD_SH(input + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(input + 16);
+  in19 = LD_SH(input + 24);
+  in28 = LD_SH(input + 96);
+  in29 = LD_SH(input + 104);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, input + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, input + 40);
+  vec4 = in29 - in26;
+  ST_SH(vec4, input + 80);
+  vec4 = in28 - in27;
+  ST_SH(vec4, input + 88);
+
+  in21 = in18 + in21;
+  in20 = in19 + in20;
+  in27 = in28 + in27;
+  in26 = in29 + in26;
+
+  LD_SH4(input + 48, 8, in22, in23, in24, in25);
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(input);
+  in17 = LD_SH(input + 8);
+  in30 = LD_SH(input + 112);
+  in31 = LD_SH(input + 120);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, input + 16);
+  vec4 = in16 - in23;
+  ST_SH(vec4, input + 24);
+  vec4 = in31 - in24;
+  ST_SH(vec4, input + 96);
+  vec4 = in30 - in25;
+  ST_SH(vec4, input + 104);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr);
+  ST_SH(vec4, temp_ptr + 960);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 448);
+  ST_SH(vec4, temp_ptr + 512);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 704);
+  ST_SH(vec5, temp_ptr + 256);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 192);
+  ST_SH(vec5, temp_ptr + 768);
+
+  LD_SH4(input + 16, 8, in22, in23, in20, in21);
+  LD_SH4(input + 80, 8, in26, in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 832);
+  ST_SH(vec4, temp_ptr + 128);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 320);
+  ST_SH(vec4, temp_ptr + 640);
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 576);
+  ST_SH(vec4, temp_ptr + 384);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 64);
+  ST_SH(vec4, temp_ptr + 896);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+
+  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+  /* 2nd set */
+  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+         (output + 8 * 8), 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+  /* Stage 3 */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
+       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
+       vec0_r, vec1_r, vec2_r, vec3_r);
+
+  tmp3_w = vec0_r + vec3_r;
+  vec0_r = vec0_r - vec3_r;
+  vec3_r = vec1_r + vec2_r;
+  vec1_r = vec1_r - vec2_r;
+
+  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
+                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out, 8);
+
+  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
+                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out + 16, 8);
+
+  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 32);
+  ST_SH(in5, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 40);
+  ST_SH(in5, out + 48);
+
+  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 64);
+  ST_SH(in5, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 72);
+  ST_SH(in5, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 80);
+  ST_SH(in5, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 96);
+  ST_SH(in5, out + 88);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+  /* 1st set */
+  in0 = LD_SH(temp);
+  in4 = LD_SH(temp + 32);
+  in2 = LD_SH(temp + 64);
+  in6 = LD_SH(temp + 96);
+  in1 = LD_SH(temp + 128);
+  in7 = LD_SH(temp + 152);
+  in3 = LD_SH(temp + 192);
+  in5 = LD_SH(temp + 216);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* 2nd set */
+  in0_1 = LD_SH(temp + 16);
+  in1_1 = LD_SH(temp + 232);
+  in2_1 = LD_SH(temp + 80);
+  in3_1 = LD_SH(temp + 168);
+  in4_1 = LD_SH(temp + 48);
+  in5_1 = LD_SH(temp + 176);
+  in6_1 = LD_SH(temp + 112);
+  in7_1 = LD_SH(temp + 240);
+
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set */
+  in0 = LD_SH(temp + 8);
+  in1 = LD_SH(temp + 136);
+  in2 = LD_SH(temp + 72);
+  in3 = LD_SH(temp + 200);
+  in4 = LD_SH(temp + 40);
+  in5 = LD_SH(temp + 208);
+  in6 = LD_SH(temp + 104);
+  in7 = LD_SH(temp + 144);
+
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 8, 32);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+  /* 4th set */
+  in0_1 = LD_SH(temp + 24);
+  in1_1 = LD_SH(temp + 224);
+  in2_1 = LD_SH(temp + 88);
+  in3_1 = LD_SH(temp + 160);
+  in4_1 = LD_SH(temp + 56);
+  in5_1 = LD_SH(temp + 184);
+  in6_1 = LD_SH(temp + 120);
+  in7_1 = LD_SH(temp + 248);
+
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 24, 32);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
+                            int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+  fdct8x32_1d_row_even(temp_buf, temp_buf);
+  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+  fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
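+/* Full 32x32 forward transform: four column passes over 8-column slices into
+   tmp_buf_big, then four row passes over 8-row slices.  The first row slice
+   uses the _4x path, which widens part of the even-half arithmetic to 32
+   bits. */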
+void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+                       tmp_buf_big + (8 * i));
+  }
+
+  /* row transform */
+  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform */
+  for (i = 1; i < 4; ++i) {
+    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+  }
+}
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+
+  temp0 = in0 + in3;
+  in0 = in0 - in3;
+  in3 = in1 + in2;
+  in1 = in1 - in2;
+
+  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+  v8i16 vec4, vec5;
+
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+
+  FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+
+  FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = in28 + in29;
+  in19 = in31 + in30;
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
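+/* Reduced-precision variant, presumably for the encoder's RD search: the row
+   pass applies the FDCT_POSTPROC rounding to the loaded column-pass results
+   up front instead of to each output pair before it is stored. */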
+void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+                       &tmp_buf_big[0] + (8 * i));
+  }
+
+  /* row transform */
+  for (i = 0; i < 4; ++i) {
+    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+                       out + (8 * i * 32));
+  }
+}
+
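+/* DC-only 32x32 transform: sum all input samples over the sixteen 8x8
+   sub-blocks with LD_HADD and scale the total by 1/8; only out[0] (and a
+   zeroed out[1]) is written. */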
+void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[1] = 0;
+
+  out[0] = LD_HADD(input, stride);
+  out[0] += LD_HADD(input + 8, stride);
+  out[0] += LD_HADD(input + 16, stride);
+  out[0] += LD_HADD(input + 24, stride);
+  out[0] += LD_HADD(input + 32 * 8, stride);
+  out[0] += LD_HADD(input + 32 * 8 + 8, stride);
+  out[0] += LD_HADD(input + 32 * 8 + 16, stride);
+  out[0] += LD_HADD(input + 32 * 8 + 24, stride);
+  out[0] += LD_HADD(input + 32 * 16, stride);
+  out[0] += LD_HADD(input + 32 * 16 + 8, stride);
+  out[0] += LD_HADD(input + 32 * 16 + 16, stride);
+  out[0] += LD_HADD(input + 32 * 16 + 24, stride);
+  out[0] += LD_HADD(input + 32 * 24, stride);
+  out[0] += LD_HADD(input + 32 * 24 + 8, stride);
+  out[0] += LD_HADD(input + 32 * 24 + 16, stride);
+  out[0] += LD_HADD(input + 32 * 24 + 24, stride);
+  out[0] >>= 3;
+}
diff --git a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
new file mode 100644
index 0000000..f66dd5f
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
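+/* Column pass of the 16x16 forward DCT for one slice of 8 columns: inputs are
+   scaled up by 4, the even half goes through FDCT8x16_EVEN and the odd half
+   is computed inline; even rows land at tmp_ptr + 32 * k and odd rows at
+   tmp_ptr + 16 + 32 * k. */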
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+                 -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+                   cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
+                   0, 0, 0, 0 };
+
+  LD_SH16(input, src_stride,
+          in0, in1, in2, in3, in4, in5, in6, in7,
+          in8, in9, in10, in11, in12, in13, in14, in15);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in8, in9, in10, in11, 2);
+  SLLI_4V(in12, in13, in14, in15, 2);
+  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+
+  tmp_ptr += 16;
+
+  /* stp 1 */
+  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __msa_splati_h(coeff, 0);
+  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+  cnst5 = __msa_splati_h(coeff, 1);
+  cnst5 = __msa_ilvev_h(cnst5, cnst4);
+  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+  /* stp2 */
+  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+  cnst0 = __msa_splati_h(coeff, 4);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  ILVRL_H2_SH(in15, in8, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr);
+
+  cnst0 = __msa_splati_h(coeff2, 0);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 224);
+
+  ILVRL_H2_SH(in14, in9, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 128);
+
+  cnst1 = __msa_splati_h(coeff2, 2);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 96);
+
+  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  cnst1 = __msa_splati_h(coeff, 3);
+  cnst1 = __msa_ilvev_h(cnst0, cnst1);
+  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  /* stp4 */
+  ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+  ILVRL_H2_SH(in13, in10, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 64);
+
+  cnst0 = __msa_splati_h(coeff2, 1);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 160);
+
+  SUB2(stp34, stp25, stp33, stp22, in12, in11);
+  ILVRL_H2_SH(in12, in11, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 192);
+
+  cnst1 = __msa_splati_h(coeff2, 3);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 32);
+}
+
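+/* Row pass of the 16x16 forward DCT for one slice of 8 rows: transpose the
+   two 8x8 halves, round the column-pass output with (x + 1) >> 2, run the
+   even and odd 16-point butterflies, then transpose back and store. */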
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+
+  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+  SRA_4V(in0, in1, in2, in3, 2);
+  SRA_4V(in4, in5, in6, in7, 2);
+  SRA_4V(in8, in9, in10, in11, 2);
+  SRA_4V(in12, in13, in14, in15, 2);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
+               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+               in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
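+/* 4x4 forward DCT: inputs are scaled by 16, the first sample is bumped by 1
+   when non-zero (mirroring the C reference), then two VP9_FDCT4 passes with
+   transposes and a final (x + 1) >> 2 rounding. */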
+void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3;
+
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  /* fdct4 pre-process */
+  {
+    v8i16 vec, mask;
+    v16i8 zero = { 0 };
+    v16i8 one = __msa_ldi_b(1);
+
+    mask = (v8i16)__msa_sldi_b(zero, one, 15);
+    SLLI_4V(in0, in1, in2, in3, 4);
+    vec = __msa_ceqi_h(in0, 0);
+    vec = vec ^ 255;
+    vec = mask & vec;
+    in0 += vec;
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  SRA_4V(in0, in1, in2, in3, 2);
+  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output, 8);
+}
+
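+/* 8x8 forward DCT: inputs scaled by 4, two VP9_FDCT8 passes with transposes,
+   and a final divide-by-two of every coefficient via SRLI_AVE_S_4V_H. */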
+void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+            in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+            in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
+
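+/* DC-only 8x8 transform: out[0] is simply the sum of the 8x8 block. */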
+void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[0] = LD_HADD(input, stride);
+  out[1] = 0;
+}
+
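+/* Full 16x16 forward transform: two column passes over 8-column slices into a
+   temporary buffer, then two row passes over 8-row slices into output. */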
+void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* column transform */
+  for (i = 0; i < 2; ++i) {
+    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+  }
+
+  /* row transform */
+  for (i = 0; i < 2; ++i) {
+    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+  }
+}
+
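+/* DC-only 16x16 transform: sum the four 8x8 quadrants and halve the total. */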
+void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[1] = 0;
+
+  out[0] = LD_HADD(input, stride);
+  out[0] += LD_HADD(input + 8, stride);
+  out[0] += LD_HADD(input + 16 * 8, stride);
+  out[0] += LD_HADD(input + 16 * 8 + 8, stride);
+  out[0] >>= 1;
+}
diff --git a/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
new file mode 100644
index 0000000..d1e160e
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
@@ -0,0 +1,373 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
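+/* Load an 8x8 block of int16 samples from psrc with the given row stride and
+   return their horizontal sum as a single scalar. */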
+#define LD_HADD(psrc, stride) ({                                      \
+  v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;       \
+  v4i32 vec_w_m;                                                      \
+                                                                      \
+  LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                 \
+  ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                     \
+  LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);  \
+  ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m,        \
+       in4_m, in6_m, in0_m, in4_m);                                   \
+  in0_m += in4_m;                                                     \
+                                                                      \
+  vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                             \
+  HADD_SW_S32(vec_w_m);                                               \
+})
+
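+/* 4-point 1-D forward DCT applied to eight columns at once: a butterfly stage
+   followed by dot products against packed cospi constants, rounded by
+   DCT_CONST_BITS. */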
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) {     \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                         \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                             \
+  v4i32 vec4_m, vec5_m, vec6_m, vec7_m;                             \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,          \
+                    cospi_24_64, -cospi_8_64, 0, 0, 0 };            \
+                                                                    \
+  BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);  \
+  ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m);       \
+  SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m);                    \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                        \
+  vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m);                         \
+                                                                    \
+  SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m);                    \
+  cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m);                        \
+  vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m);                         \
+                                                                    \
+  vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m);                         \
+  cnst2_m = __msa_splati_h(coeff_m, 2);                             \
+  cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m);                        \
+  vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m);                         \
+                                                                    \
+  SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS);      \
+  PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m,       \
+              vec7_m, vec7_m, out0, out2, out1, out3);              \
+}
+
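+/* Divide eight vectors of coefficients by two, rounding toward zero: the sign
+   bit (x >> 15) is averaged back into x, which appears to match the final
+   "/= 2" of the C reference. */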
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) {        \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
+                                                                         \
+  SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15);    \
+  SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15);    \
+  AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3,         \
+             in0, in1, in2, in3);                                        \
+  AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7,         \
+             in4, in5, in6, in7);                                        \
+}
+
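+/* 8-point 1-D forward DCT on eight columns at once; the four stages mirror
+   the scalar vpx_fdct8x8_c reference. */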
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                    \
+  v8i16 s7_m, x0_m, x1_m, x2_m, x3_m;                                \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,           \
+                    cospi_24_64, cospi_4_64, cospi_28_64,            \
+                    cospi_12_64, cospi_20_64 };                      \
+                                                                     \
+  /* FDCT stage1 */                                                  \
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                \
+              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);       \
+  BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);       \
+  ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                    \
+  ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                    \
+  SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                           \
+  x1_m = __msa_ilvev_h(x1_m, x0_m);                                  \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                           \
+  x2_m = -x2_m;                                                      \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                  \
+  out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                    \
+                                                                     \
+  out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                    \
+  x2_m = __msa_splati_h(coeff_m, 2);                                 \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                  \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                    \
+                                                                     \
+  /* stage2 */                                                       \
+  ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                               \
+                                                                     \
+  s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                    \
+  s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                    \
+                                                                     \
+  /* stage3 */                                                       \
+  BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);       \
+                                                                     \
+  /* stage4 */                                                       \
+  ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                    \
+  ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                           \
+  x1_m = __msa_ilvev_h(x0_m, x1_m);                                  \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                           \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                  \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                    \
+                                                                     \
+  x1_m = __msa_splati_h(coeff_m, 5);                                 \
+  x0_m = -x0_m;                                                      \
+  x0_m = __msa_ilvev_h(x1_m, x0_m);                                  \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                    \
+                                                                     \
+  x2_m = __msa_splati_h(coeff_m, 6);                                 \
+  x3_m = -x3_m;                                                      \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                  \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                    \
+}
+
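+/* even half of the 8x16 forward DCT; body matches VP9_FDCT8 */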
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7,                \
+                      out0, out1, out2, out3, out4, out5, out6, out7) {      \
+  v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                      \
+  v8i16 x0_m, x1_m, x2_m, x3_m;                                              \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,      \
+                    cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 };     \
+                                                                             \
+  /* FDCT stage1 */                                                          \
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                        \
+              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);               \
+  BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);               \
+  ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                            \
+  ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                            \
+  SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                   \
+  x2_m = -x2_m;                                                              \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                            \
+                                                                             \
+  out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                            \
+  x2_m = __msa_splati_h(coeff_m, 2);                                         \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                            \
+                                                                             \
+  /* stage2 */                                                               \
+  ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                       \
+                                                                             \
+  s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                            \
+  s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                            \
+                                                                             \
+  /* stage3 */                                                               \
+  BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);               \
+                                                                             \
+  /* stage4 */                                                               \
+  ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                            \
+  ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x0_m, x1_m);                                          \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                   \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                            \
+                                                                             \
+  x1_m = __msa_splati_h(coeff_m, 5);                                         \
+  x0_m = -x0_m;                                                              \
+  x0_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                            \
+                                                                             \
+  x2_m = __msa_splati_h(coeff_m, 6);                                         \
+  x3_m = -x3_m;                                                              \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                            \
+}
+
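+/* odd half of the 8x16 forward DCT; produces the odd-indexed outputs */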
+#define FDCT8x16_ODD(input0, input1, input2, input3,               \
+                     input4, input5, input6, input7,               \
+                     out1, out3, out5, out7,                       \
+                     out9, out11, out13, out15) {                  \
+  v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;      \
+  v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;      \
+  v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                          \
+  v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                    \
+  v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                        \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,         \
+                    cospi_24_64, -cospi_8_64, -cospi_24_64,        \
+                    cospi_12_64, cospi_20_64 };                    \
+  v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64,         \
+                     cospi_18_64, cospi_10_64, cospi_22_64,        \
+                     cospi_6_64, cospi_26_64 };                    \
+  v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64,      \
+                     -cospi_26_64, 0, 0, 0, 0 };                   \
+                                                                   \
+  /* stp 1 */                                                      \
+  ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);      \
+  ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m);      \
+                                                                   \
+  cnst4_m = __msa_splati_h(coeff_m, 0);                            \
+  stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m);        \
+                                                                   \
+  cnst5_m = __msa_splati_h(coeff_m, 1);                            \
+  cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m);                       \
+  stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m);        \
+  stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m);        \
+  stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);        \
+                                                                   \
+  /* stp2 */                                                       \
+  BUTTERFLY_4(input0, input1, stp22_m, stp23_m,                    \
+              stp30_m, stp31_m, stp32_m, stp33_m);                 \
+  BUTTERFLY_4(input7, input6, stp25_m, stp24_m,                    \
+              stp37_m, stp36_m, stp35_m, stp34_m);                 \
+                                                                   \
+  ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);  \
+  ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);  \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m);                   \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);        \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 4);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);        \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m);                   \
+  cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);        \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 3);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);        \
+                                                                   \
+  /* stp4 */                                                       \
+  BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m,                  \
+              vec6_m, vec2_m, vec4_m, vec5_m);                     \
+  BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m,                  \
+              stp21_m, stp23_m, stp24_m, stp31_m);                 \
+                                                                   \
+  ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+                                                                   \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 0);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);          \
+                                                                   \
+  ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);           \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 2);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 1);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);          \
+                                                                   \
+  ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);          \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 3);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+}
+
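+/* rounding: vec = (vec + 1 + (vec < 0)) >> 2 for two v8i16 vectors */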
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) {      \
+  v8i16 tp0_m, tp1_m;                             \
+  v8i16 one_m = __msa_ldi_h(1);                   \
+                                                  \
+  tp0_m = __msa_clti_s_h(vec0, 0);                \
+  tp1_m = __msa_clti_s_h(vec1, 0);                \
+  vec0 += 1;                                      \
+  vec1 += 1;                                      \
+  tp0_m = one_m & tp0_m;                          \
+  tp1_m = one_m & tp1_m;                          \
+  vec0 += tp0_m;                                  \
+  vec1 += tp1_m;                                  \
+  vec0 >>= 2;                                     \
+  vec1 >>= 2;                                     \
+}
+
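+/* rounding: vec = (vec + 1 + (vec < 0)) >> 2 for one v4i32 vector */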
+#define FDCT32_POSTPROC_NEG_W(vec) {      \
+  v4i32 temp_m;                           \
+  v4i32 one_m = __msa_ldi_w(1);           \
+                                          \
+  temp_m = __msa_clti_s_w(vec, 0);        \
+  vec += 1;                               \
+  temp_m = one_m & temp_m;                \
+  vec += temp_m;                          \
+  vec >>= 2;                              \
+}
+
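+/* rounding with positive bias: vec = (vec + 1 + (vec > 0)) >> 2 */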
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) {      \
+  v8i16 tp0_m, tp1_m;                               \
+  v8i16 one = __msa_ldi_h(1);                       \
+                                                    \
+  tp0_m = __msa_clei_s_h(vec0, 0);                  \
+  tp1_m = __msa_clei_s_h(vec1, 0);                  \
+  tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255);   \
+  tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255);   \
+  vec0 += 1;                                        \
+  vec1 += 1;                                        \
+  tp0_m = one & tp0_m;                              \
+  tp1_m = one & tp1_m;                              \
+  vec0 += tp0_m;                                    \
+  vec1 += tp1_m;                                    \
+  vec0 >>= 2;                                       \
+  vec1 >>= 2;                                       \
+}
+
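+/* 32-bit constant-pair rotation with 64-bit accumulation:
+   out0/out1 = reg0 * const0 - reg1 * const1 (left/right halves),
+   out2/out3 = reg1 * const0 + reg0 * const1, rounded by DCT_CONST_BITS */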
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right,      \
+                          reg1_right, const0, const1,            \
+                          out0, out1, out2, out3) {              \
+  v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;          \
+  v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                              \
+  v4i32 k0_m = __msa_fill_w((int32_t) const0);                   \
+                                                                 \
+  s0_m = __msa_fill_w((int32_t) const1);                         \
+  k0_m = __msa_ilvev_w(s0_m, k0_m);                              \
+                                                                 \
+  ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m);                \
+  ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                 \
+  ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m);              \
+  ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);               \
+                                                                 \
+  DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+                                                                 \
+  DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+}
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif  // VPX_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/libvpx/vpx_dsp/mips/idct16x16_msa.c b/libvpx/vpx_dsp/mips/idct16x16_msa.c
new file mode 100644
index 0000000..5faac71
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/idct16x16_msa.c
@@ -0,0 +1,487 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
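+/* 16-point 1-D IDCT over 8 rows: transpose the two 8x8 input halves, run
+   the butterfly stages, transpose back and store 8 rows of 16 outputs. */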
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+  v8i16 loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+  v8i16 tmp5, tmp6, tmp7;
+
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8;
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                     reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
+                     reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+  SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
+       reg8);
+  ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
+       reg10);
+
+  /* stage 2 */
+  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+  reg9 = reg1 - loc2;
+  reg1 = reg1 + loc2;
+  reg7 = reg15 - loc3;
+  reg15 = reg15 + loc3;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+  loc1 = reg15 + reg3;
+  reg3 = reg15 - reg3;
+  loc2 = reg2 + loc1;
+  reg15 = reg2 - loc1;
+
+  loc1 = reg1 + reg13;
+  reg13 = reg1 - reg13;
+  loc0 = reg0 + loc1;
+  loc1 = reg0 - loc1;
+  tmp6 = loc0;
+  tmp7 = loc1;
+  reg0 = loc2;
+
+  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+  loc0 = reg9 + reg5;
+  reg5 = reg9 - reg5;
+  reg2 = reg6 + loc0;
+  reg1 = reg6 - loc0;
+
+  loc0 = reg7 + reg11;
+  reg11 = reg7 - reg11;
+  loc1 = reg4 + loc0;
+  loc2 = reg4 - loc0;
+  tmp5 = loc1;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+  reg10 = loc0;
+  reg11 = loc1;
+
+  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+
+  reg13 = loc2;
+
+  /* Transpose and store the output */
+  reg12 = tmp5;
+  reg14 = tmp6;
+  reg3 = tmp7;
+
+  /* transpose block */
+  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                     reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
+
+  /* transpose block */
+  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                     reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
+}
+
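+/* column pass over 8 columns: butterfly stages, then round-shift by 6 and
+   add the result to the destination block. */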
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                      int32_t dst_stride) {
+  v8i16 loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+  v8i16 tmp5, tmp6, tmp7;
+
+  /* load top 8x8 */
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8 * 16;
+  /* load bottom 8x8 */
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+  reg0 = reg2 - loc1;
+  reg2 = reg2 + loc1;
+  reg12 = reg14 - loc0;
+  reg14 = reg14 + loc0;
+  reg4 = reg6 - loc3;
+  reg6 = reg6 + loc3;
+  reg8 = reg10 - loc2;
+  reg10 = reg10 + loc2;
+
+  /* stage 2 */
+  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+  reg9 = reg1 - loc2;
+  reg1 = reg1 + loc2;
+  reg7 = reg15 - loc3;
+  reg15 = reg15 + loc3;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+  loc1 = reg15 + reg3;
+  reg3 = reg15 - reg3;
+  loc2 = reg2 + loc1;
+  reg15 = reg2 - loc1;
+
+  loc1 = reg1 + reg13;
+  reg13 = reg1 - reg13;
+  loc0 = reg0 + loc1;
+  loc1 = reg0 - loc1;
+  tmp6 = loc0;
+  tmp7 = loc1;
+  reg0 = loc2;
+
+  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+  loc0 = reg9 + reg5;
+  reg5 = reg9 - reg5;
+  reg2 = reg6 + loc0;
+  reg1 = reg6 - loc0;
+
+  loc0 = reg7 + reg11;
+  reg11 = reg7 - reg11;
+  loc1 = reg4 + loc0;
+  loc2 = reg4 - loc0;
+  tmp5 = loc1;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+  reg10 = loc0;
+  reg11 = loc1;
+
+  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+  reg13 = loc2;
+
+  /* Transpose and store the output */
+  reg12 = tmp5;
+  reg14 = tmp6;
+  reg3 = tmp7;
+
+  SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                               int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+  int16_t *out = out_arr;
+
+  /* transform rows */
+  for (i = 0; i < 2; ++i) {
+    /* process 16 * 8 block */
+    vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
+  }
+
+  /* transform columns */
+  for (i = 0; i < 2; ++i) {
+    /* process 8 * 16 block */
+    vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                     dst_stride);
+  }
+}
+
+void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  uint8_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+  int16_t *out = out_arr;
+
+  /* process 16 * 8 block */
+  vpx_idct16_1d_rows_msa(input, out);
+
+  /* in the short case only the top 4 rows contain valid output, so zero the rest */
+  out += 4 * 16;
+  for (i = 12; i--;) {
+    __asm__ __volatile__ (
+        "sw     $zero,   0(%[out])     \n\t"
+        "sw     $zero,   4(%[out])     \n\t"
+        "sw     $zero,   8(%[out])     \n\t"
+        "sw     $zero,  12(%[out])     \n\t"
+        "sw     $zero,  16(%[out])     \n\t"
+        "sw     $zero,  20(%[out])     \n\t"
+        "sw     $zero,  24(%[out])     \n\t"
+        "sw     $zero,  28(%[out])     \n\t"
+
+        :
+        : [out] "r" (out)
+    );
+
+    out += 16;
+  }
+
+  out = out_arr;
+
+  /* transform columns */
+  for (i = 0; i < 2; ++i) {
+    /* process 8 * 16 block */
+    vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                     dst_stride);
+  }
+}
+
+void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  uint8_t i;
+  int16_t out;
+  v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);
+
+  vec = __msa_fill_h(out);
+
+  for (i = 4; i--;) {
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data */
+  LD_SH16(input, 8,
+          l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);
+
+  /* ADST in horizontal direction */
+  VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+                   l8, l9, l10, l11, l12, l13, l14, l15,
+                   r0, r1, r2, r3, r4, r5, r6, r7,
+                   r8, r9, r10, r11, r12, r13, r14, r15);
+
+  l1 = -r8;
+  l3 = -r4;
+  l13 = -r13;
+  l15 = -r1;
+
+  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);
+  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride) {
+  v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+  v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+  v16i8 zero = { 0 };
+
+  r0 = LD_SH(input + 0 * 16);
+  r3 = LD_SH(input + 3 * 16);
+  r4 = LD_SH(input + 4 * 16);
+  r7 = LD_SH(input + 7 * 16);
+  r8 = LD_SH(input + 8 * 16);
+  r11 = LD_SH(input + 11 * 16);
+  r12 = LD_SH(input + 12 * 16);
+  r15 = LD_SH(input + 15 * 16);
+
+  /* stage 1 */
+  k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+  k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+  BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+  k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+  r1 = LD_SH(input + 1 * 16);
+  r2 = LD_SH(input + 2 * 16);
+  r5 = LD_SH(input + 5 * 16);
+  r6 = LD_SH(input + 6 * 16);
+  r9 = LD_SH(input + 9 * 16);
+  r10 = LD_SH(input + 10 * 16);
+  r13 = LD_SH(input + 13 * 16);
+  r14 = LD_SH(input + 14 * 16);
+
+  k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+  k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+  BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+  BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+  out1 = -out1;
+  SRARI_H2_SH(out0, out1, 6);
+  dst0 = LD_UB(dst + 0 * dst_stride);
+  dst1 = LD_UB(dst + 15 * dst_stride);
+  ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+  ADD2(res0, out0, res1, out1, res0, res1);
+  CLIP_SH2_0_255(res0, res1);
+  PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+  ST8x1_UB(res0, dst);
+  ST8x1_UB(res1, dst + 15 * dst_stride);
+
+  k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+  k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+
+  SRARI_H2_SH(out8, out9, 6);
+  dst8 = LD_UB(dst + 1 * dst_stride);
+  dst9 = LD_UB(dst + 14 * dst_stride);
+  ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+  ADD2(res8, out8, res9, out9, res8, res9);
+  CLIP_SH2_0_255(res8, res9);
+  PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+  ST8x1_UB(res8, dst + dst_stride);
+  ST8x1_UB(res9, dst + 14 * dst_stride);
+
+  k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+  MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  SRARI_H2_SH(out4, out5, 6);
+  dst4 = LD_UB(dst + 3 * dst_stride);
+  dst5 = LD_UB(dst + 12 * dst_stride);
+  ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+  ADD2(res4, out4, res5, out5, res4, res5);
+  CLIP_SH2_0_255(res4, res5);
+  PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+  ST8x1_UB(res4, dst + 3 * dst_stride);
+  ST8x1_UB(res5, dst + 12 * dst_stride);
+
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  SRARI_H2_SH(out12, out13, 6);
+  dst12 = LD_UB(dst + 2 * dst_stride);
+  dst13 = LD_UB(dst + 13 * dst_stride);
+  ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+  ADD2(res12, out12, res13, out13, res12, res13);
+  CLIP_SH2_0_255(res12, res13);
+  PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+  ST8x1_UB(res12, dst + 2 * dst_stride);
+  ST8x1_UB(res13, dst + 13 * dst_stride);
+
+  k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+  k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  SRARI_H2_SH(out6, out7, 6);
+  dst6 = LD_UB(dst + 4 * dst_stride);
+  dst7 = LD_UB(dst + 11 * dst_stride);
+  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+  ADD2(res6, out6, res7, out7, res6, res7);
+  CLIP_SH2_0_255(res6, res7);
+  PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+  ST8x1_UB(res6, dst + 4 * dst_stride);
+  ST8x1_UB(res7, dst + 11 * dst_stride);
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  SRARI_H2_SH(out10, out11, 6);
+  dst10 = LD_UB(dst + 6 * dst_stride);
+  dst11 = LD_UB(dst + 9 * dst_stride);
+  ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+  ADD2(res10, out10, res11, out11, res10, res11);
+  CLIP_SH2_0_255(res10, res11);
+  PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+  ST8x1_UB(res10, dst + 6 * dst_stride);
+  ST8x1_UB(res11, dst + 9 * dst_stride);
+
+  k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  SRARI_H2_SH(out2, out3, 6);
+  dst2 = LD_UB(dst + 7 * dst_stride);
+  dst3 = LD_UB(dst + 8 * dst_stride);
+  ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+  ADD2(res2, out2, res3, out3, res2, res3);
+  CLIP_SH2_0_255(res2, res3);
+  PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+  ST8x1_UB(res2, dst + 7 * dst_stride);
+  ST8x1_UB(res3, dst + 8 * dst_stride);
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  SRARI_H2_SH(out14, out15, 6);
+  dst14 = LD_UB(dst + 5 * dst_stride);
+  dst15 = LD_UB(dst + 10 * dst_stride);
+  ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+  ADD2(res14, out14, res15, out15, res14, res15);
+  CLIP_SH2_0_255(res14, res15);
+  PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+  ST8x1_UB(res14, dst + 5 * dst_stride);
+  ST8x1_UB(res15, dst + 10 * dst_stride);
+}
diff --git a/libvpx/vpx_dsp/mips/idct32x32_msa.c b/libvpx/vpx_dsp/mips/idct32x32_msa.c
new file mode 100644
index 0000000..d5b3966
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/idct32x32_msa.c
@@ -0,0 +1,739 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
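+/* transpose a 32x8 coefficient block (four 8x8 sub-blocks) into tmp_buf */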
+static void idct32x8_row_transpose_store(const int16_t *input,
+                                         int16_t *tmp_buf) {
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* 1st & 2nd 8x8 */
+  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
+  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
+  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
+
+  /* 3rd & 4th 8x8 */
+  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
+  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
+  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
+  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
+}
+
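+/* even part of the 32-point 1-D row IDCT; results go to tmp_eve_buf */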
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+                                            int16_t *tmp_eve_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+  /* Even stage 1 */
+  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = reg0 + reg4;
+  reg0 = reg0 - reg4;
+  reg4 = reg6 + reg2;
+  reg6 = reg6 - reg2;
+  reg2 = reg1 + reg5;
+  reg1 = reg1 - reg5;
+  reg5 = reg7 + reg3;
+  reg7 = reg7 - reg3;
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = reg3 + reg4;
+  reg3 = reg3 - reg4;
+  reg4 = reg5 - vec1;
+  reg5 = reg5 + vec1;
+
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = reg0 - reg6;
+  reg0 = reg0 + reg6;
+  vec1 = reg7 - reg1;
+  reg7 = reg7 + reg1;
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 15 * 8));
+  ST_SH(loc1, (tmp_eve_buf));
+  ST_SH(loc2, (tmp_eve_buf + 14 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 8));
+
+  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 13 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 2 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 12 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+  /* Store 8 */
+  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 11 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 4 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
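+/* odd part of the 32-point 1-D row IDCT; results go to tmp_odd_buf */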
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+                                           int16_t *tmp_odd_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  reg0 = LD_SH(tmp_buf + 8);
+  reg1 = LD_SH(tmp_buf + 7 * 8);
+  reg2 = LD_SH(tmp_buf + 9 * 8);
+  reg3 = LD_SH(tmp_buf + 15 * 8);
+  reg4 = LD_SH(tmp_buf + 17 * 8);
+  reg5 = LD_SH(tmp_buf + 23 * 8);
+  reg6 = LD_SH(tmp_buf + 25 * 8);
+  reg7 = LD_SH(tmp_buf + 31 * 8);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = reg0 + reg3;
+  reg0 = reg0 - reg3;
+  reg3 = reg7 + reg4;
+  reg7 = reg7 - reg4;
+  reg4 = reg1 + reg2;
+  reg1 = reg1 - reg2;
+  reg2 = reg6 + reg5;
+  reg6 = reg6 - reg5;
+  reg5 = vec0;
+
+  /* 4 Stores */
+  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  reg0 = LD_SH(tmp_buf + 3 * 8);
+  reg1 = LD_SH(tmp_buf + 5 * 8);
+  reg2 = LD_SH(tmp_buf + 11 * 8);
+  reg3 = LD_SH(tmp_buf + 13 * 8);
+  reg4 = LD_SH(tmp_buf + 19 * 8);
+  reg5 = LD_SH(tmp_buf + 21 * 8);
+  reg6 = LD_SH(tmp_buf + 27 * 8);
+  reg7 = LD_SH(tmp_buf + 29 * 8);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+       vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+  BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+  /* 4 Stores */
+  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+       vec1, vec2, vec0, vec3);
+  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
+  ST_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+
+  /* Load 8 & Store 8 */
+  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+       loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Load 8 & Store 8 */
+  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+       loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
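+/* final butterfly of the even/odd halves, then 32x8 transpose to the output */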
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                           int16_t *tmp_eve_buf,
+                                           int16_t *tmp_odd_buf,
+                                           int16_t *dst) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  vec0 = LD_SH(tmp_odd_buf);
+  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+  loc0 = LD_SH(tmp_eve_buf);
+  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+  /* Transpose : 16 vectors */
+  /* 1st & 2nd 8x8 */
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+  /* 3rd & 4th 8x8 */
+  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct32x8_row_transpose_store(input, &tmp_buf[0]);
+  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
+                                 &tmp_odd_buf[0], output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+  /* Even stage 1 */
+  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  tmp_buf += (2 * 32);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  /* Load 8 */
+  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = reg0 + reg4;
+  reg0 = reg0 - reg4;
+  reg4 = reg6 + reg2;
+  reg6 = reg6 - reg2;
+  reg2 = reg1 + reg5;
+  reg1 = reg1 - reg5;
+  reg5 = reg7 + reg3;
+  reg7 = reg7 - reg3;
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = reg3 + reg4;
+  reg3 = reg3 - reg4;
+  reg4 = reg5 - vec1;
+  reg5 = reg5 + vec1;
+
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = reg0 - reg6;
+  reg0 = reg0 + reg6;
+  vec1 = reg7 - reg1;
+  reg7 = reg7 + reg1;
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  /* Store 8 */
+  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+  /* Store 8 */
+  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                              int16_t *tmp_odd_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  reg0 = LD_SH(tmp_buf + 32);
+  reg1 = LD_SH(tmp_buf + 7 * 32);
+  reg2 = LD_SH(tmp_buf + 9 * 32);
+  reg3 = LD_SH(tmp_buf + 15 * 32);
+  reg4 = LD_SH(tmp_buf + 17 * 32);
+  reg5 = LD_SH(tmp_buf + 23 * 32);
+  reg6 = LD_SH(tmp_buf + 25 * 32);
+  reg7 = LD_SH(tmp_buf + 31 * 32);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = reg0 + reg3;
+  reg0 = reg0 - reg3;
+  reg3 = reg7 + reg4;
+  reg7 = reg7 - reg4;
+  reg4 = reg1 + reg2;
+  reg1 = reg1 - reg2;
+  reg2 = reg6 + reg5;
+  reg6 = reg6 - reg5;
+  reg5 = vec0;
+
+  /* 4 Stores */
+  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  reg0 = LD_SH(tmp_buf + 3 * 32);
+  reg1 = LD_SH(tmp_buf + 5 * 32);
+  reg2 = LD_SH(tmp_buf + 11 * 32);
+  reg3 = LD_SH(tmp_buf + 13 * 32);
+  reg4 = LD_SH(tmp_buf + 19 * 32);
+  reg5 = LD_SH(tmp_buf + 21 * 32);
+  reg6 = LD_SH(tmp_buf + 27 * 32);
+  reg7 = LD_SH(tmp_buf + 29 * 32);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+  BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+  /* 4 Stores */
+  ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
+  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+  /* Load 8 & Store 8 */
+  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Load 8 & Store 8 */
+  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
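+/* final butterfly of the column pass: round-shift by 6 and add to dst pixels */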
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                             int16_t *tmp_odd_buf,
+                                             uint8_t *dst,
+                                             int32_t dst_stride) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  vec0 = LD_SH(tmp_odd_buf);
+  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+  loc0 = LD_SH(tmp_eve_buf);
+  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+  SRARI_H4_SH(m0, m2, m4, m6, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+  SRARI_H4_SH(m0, m2, m4, m6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
+                      m0, m2, m4, m6);
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+  SRARI_H4_SH(m1, m3, m5, m7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
+                      m1, m3, m5, m7);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+  SRARI_H4_SH(m1, m3, m5, m7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
+                      m1, m3, m5, m7);
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+  SRARI_H4_SH(n0, n2, n4, n6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
+                      n0, n2, n4, n6);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+  SRARI_H4_SH(n0, n2, n4, n6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
+                      n0, n2, n4, n6);
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+  SRARI_H4_SH(n1, n3, n5, n7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
+                      n1, n3, n5, n7);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+  SRARI_H4_SH(n1, n3, n5, n7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
+                      n1, n3, n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+                                   dst, dst_stride);
+}
+
+void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
+                                int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  /* transform rows */
+  for (i = 0; i < 4; ++i) {
+    /* process 32 * 8 block */
+    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+  }
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  for (i = 32; i--;) {
+    __asm__ __volatile__ (
+        "sw     $zero,      0(%[out_ptr])     \n\t"
+        "sw     $zero,      4(%[out_ptr])     \n\t"
+        "sw     $zero,      8(%[out_ptr])     \n\t"
+        "sw     $zero,     12(%[out_ptr])     \n\t"
+        "sw     $zero,     16(%[out_ptr])     \n\t"
+        "sw     $zero,     20(%[out_ptr])     \n\t"
+        "sw     $zero,     24(%[out_ptr])     \n\t"
+        "sw     $zero,     28(%[out_ptr])     \n\t"
+        "sw     $zero,     32(%[out_ptr])     \n\t"
+        "sw     $zero,     36(%[out_ptr])     \n\t"
+        "sw     $zero,     40(%[out_ptr])     \n\t"
+        "sw     $zero,     44(%[out_ptr])     \n\t"
+        "sw     $zero,     48(%[out_ptr])     \n\t"
+        "sw     $zero,     52(%[out_ptr])     \n\t"
+        "sw     $zero,     56(%[out_ptr])     \n\t"
+        "sw     $zero,     60(%[out_ptr])     \n\t"
+
+        :
+        : [out_ptr] "r" (out_ptr)
+    );
+
+    out_ptr += 32;
+  }
+
+  out_ptr = out_arr;
+
+  /* rows: only the upper-left 8x8 block has non-zero coefficients, */
+  /* so a single 32x8 row transform pass is sufficient */
+  idct32x8_1d_rows_msa(input, out_ptr);
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
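Each iteration of the inline-assembly loop above stores sixteen zero words, i.e. 64 bytes, which clears exactly one row of 32 int16_t coefficients; after 32 iterations the whole out_arr buffer is zero. A portable sketch of the same step (not the code used here, which relies on MIPS "sw" stores):

   #include <string.h>

   /* Clear the 32x32 int16_t coefficient buffer, 64 bytes (one row) at a
    * time, as the "sw $zero" loop above does one row per iteration. */
   static void clear_coeff_buffer(int16_t out_arr[32 * 32]) {
     memset(out_arr, 0, 32 * 32 * sizeof(out_arr[0]));
   }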
+
+void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t i;
+  int16_t out;
+  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);
+
+  vec = __msa_fill_h(out);
+
+  for (i = 16; i--;) {
+    LD_UB2(dst, 16, dst0, dst1);
+    LD_UB2(dst + dst_stride, 16, dst2, dst3);
+
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                tmp0, tmp1, tmp2, tmp3);
+
+    ST_UB2(tmp0, tmp1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(tmp2, tmp3, dst, 16);
+    dst += dst_stride;
+  }
+}
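For the DC-only case the inverse transform reduces to a single value added to every destination pixel. A scalar sketch of the arithmetic performed by the vector code above, reusing the ROUND_POWER_OF_TWO macro, cospi_16_64 and DCT_CONST_BITS already used in this file (the helper name is illustrative only):

   /* DC-only 32x32 inverse transform: one rounded value added to all pixels. */
   static void idct32x32_dc_add_sketch(const int16_t *input, uint8_t *dst,
                                       int stride) {
     int r, c, v;
     int out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
     out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
     out = ROUND_POWER_OF_TWO(out, 6);  /* final rounding by 2^6 */
     for (r = 0; r < 32; ++r) {
       for (c = 0; c < 32; ++c) {
         v = dst[r * stride + c] + out;
         dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
       }
     }
   }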
diff --git a/libvpx/vpx_dsp/mips/idct4x4_msa.c b/libvpx/vpx_dsp/mips/idct4x4_msa.c
new file mode 100644
index 0000000..f289d8e
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/idct4x4_msa.c
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3;
+  v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
+
+  /* load vector elements of 4x4 block */
+  LD4x4_SH(input, in0, in2, in3, in1);
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+  UNPCK_R_SH_SW(in0, in0_r);
+  UNPCK_R_SH_SW(in2, in2_r);
+  UNPCK_R_SH_SW(in3, in3_r);
+  UNPCK_R_SH_SW(in1, in1_r);
+  SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
+
+  in0_r += in2_r;
+  in3_r -= in1_r;
+  in4_r = (in0_r - in3_r) >> 1;
+  in1_r = in4_r - in1_r;
+  in2_r = in4_r - in2_r;
+  in0_r -= in1_r;
+  in3_r += in2_r;
+
+  TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
+
+  in0_r += in1_r;
+  in2_r -= in3_r;
+  in4_r = (in0_r - in2_r) >> 1;
+  in3_r = in4_r - in3_r;
+  in1_r = in4_r - in1_r;
+  in0_r -= in3_r;
+  in2_r += in1_r;
+
+  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r,
+              in0, in1, in2, in3);
+  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
+}
+
+void vpx_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  int16_t a1, e1;
+  v8i16 in1, in0 = { 0 };
+
+  a1 = input[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+
+  in0 = __msa_insert_h(in0, 0, a1);
+  in0 = __msa_insert_h(in0, 1, e1);
+  in0 = __msa_insert_h(in0, 2, e1);
+  in0 = __msa_insert_h(in0, 3, e1);
+
+  in1 = in0 >> 1;
+  in0 -= in1;
+
+  ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
+}
+
+void vpx_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3;
+
+  /* load vector elements of 4x4 block */
+  LD4x4_SH(input, in0, in1, in2, in3);
+  /* rows */
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+  /* columns */
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+  /* rounding (add 2^3, divide by 2^4) */
+  SRARI_H4_SH(in0, in1, in2, in3, 4);
+  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
+
+void vpx_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  int16_t out;
+  v8i16 vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 4);
+  vec = __msa_fill_h(out);
+
+  ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
+}
diff --git a/libvpx/vpx_dsp/mips/idct8x8_msa.c b/libvpx/vpx_dsp/mips/idct8x8_msa.c
new file mode 100644
index 0000000..fd667e4
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/idct8x8_msa.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* rows transform */
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  /* 1D idct8x8 */
+  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                 in0, in1, in2, in3, in4, in5, in6, in7);
+  /* columns transform */
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  /* 1D idct8x8 */
+  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                 in0, in1, in2, in3, in4, in5, in6, in7);
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+  /* add block and store 8x8 */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+  v4i32 tmp0, tmp1, tmp2, tmp3;
+  v8i16 zero = { 0 };
+
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  /* stage1 */
+  ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+  k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+  BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+  /* stage2 */
+  ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+  k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+  BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+  /* stage3 */
+  s0 = __msa_ilvr_h(s6, s5);
+
+  k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+  DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+  SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+  /* stage4 */
+  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
+              in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                 in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+  /* add block and store 8x8 */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+void vpx_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  int16_t out;
+  int32_t val;
+  v8i16 vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  val = ROUND_POWER_OF_TWO(out, 5);
+  vec = __msa_fill_h(val);
+
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
+}
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
similarity index 97%
rename from libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
rename to libvpx/vpx_dsp/mips/intrapred16_dspr2.c
index b0dc496..11444c7 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
+++ b/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -7,14 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #if HAVE_DSPR2
-void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
   int32_t  tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
@@ -161,7 +158,7 @@
   );
 }
 
-void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
   int32_t  expected_dc;
   int32_t  average;
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
similarity index 96%
rename from libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
rename to libvpx/vpx_dsp/mips/intrapred4_dspr2.c
index a53c623..03baf4c 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
+++ b/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -7,14 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #if HAVE_DSPR2
-void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   int32_t  tmp1, tmp2, tmp3, tmp4;
 
@@ -41,7 +38,7 @@
   );
 }
 
-void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   int32_t  expected_dc;
   int32_t  average;
@@ -83,7 +80,7 @@
   );
 }
 
-void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   int32_t  abovel, abover;
   int32_t  left0, left1, left2, left3;
@@ -91,7 +88,7 @@
   int32_t  resl;
   int32_t  resr;
   int32_t  top_left;
-  uint8_t  *cm = vp9_ff_cropTbl;
+  uint8_t  *cm = vpx_ff_cropTbl;
 
   __asm__ __volatile__ (
       "ulw             %[resl],       (%[above])                         \n\t"
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
similarity index 98%
rename from libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
rename to libvpx/vpx_dsp/mips/intrapred8_dspr2.c
index 40d93ae..196ff5a 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
+++ b/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -7,14 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #if HAVE_DSPR2
-void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 
@@ -70,7 +67,7 @@
   );
 }
 
-void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   int32_t  expected_dc;
   int32_t  average;
@@ -153,7 +150,7 @@
   );
 }
 
-void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   int32_t   abovel, abover;
   int32_t   abovel_1, abover_1;
@@ -161,7 +158,7 @@
   int32_t   res0, res1, res2, res3;
   int32_t   reshw;
   int32_t   top_left;
-  uint8_t   *cm = vp9_ff_cropTbl;
+  uint8_t   *cm = vpx_ff_cropTbl;
 
   __asm__ __volatile__ (
       "ulw             %[reshw],       (%[above])                         \n\t"
diff --git a/libvpx/vpx_dsp/mips/intrapred_msa.c b/libvpx/vpx_dsp/mips/intrapred_msa.c
new file mode 100644
index 0000000..f6fbe40
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/intrapred_msa.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
+  out0 = __msa_subs_u_h(out0, in0);                \
+  out1 = __msa_subs_u_h(out1, in1);                \
+}
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t src_data;
+
+  src_data = LW(src);
+
+  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t row;
+  uint32_t src_data1, src_data2;
+
+  src_data1 = LW(src);
+  src_data2 = LW(src + 4);
+
+  for (row = 8; row--;) {
+    SW(src_data1, dst);
+    SW(src_data2, (dst + 4));
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src0;
+
+  src0 = LD_UB(src);
+
+  for (row = 16; row--;) {
+    ST_UB(src0, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src1, src2;
+
+  src1 = LD_UB(src);
+  src2 = LD_UB(src + 16);
+
+  for (row = 32; row--;) {
+    ST_UB2(src1, src2, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t out0, out1, out2, out3;
+
+  out0 = src[0] * 0x01010101;
+  out1 = src[1] * 0x01010101;
+  out2 = src[2] * 0x01010101;
+  out3 = src[3] * 0x01010101;
+
+  SW4(out0, out1, out2, out3, dst, dst_stride);
+}
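Multiplying a byte by 0x01010101 replicates it into all four bytes of a 32-bit word, so each row of the 4x4 horizontal predictor becomes a single word store. A minimal illustration (hypothetical helper, not part of the file):

   #include <stdint.h>

   /* Replicate one 8-bit pixel across a 32-bit word, e.g. 0x7f -> 0x7f7f7f7f. */
   static uint32_t splat_byte_u32(uint8_t px) {
     return (uint32_t)px * 0x01010101u;
   }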
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+  out0 = src[0] * 0x0101010101010101ull;
+  out1 = src[1] * 0x0101010101010101ull;
+  out2 = src[2] * 0x0101010101010101ull;
+  out3 = src[3] * 0x0101010101010101ull;
+  out4 = src[4] * 0x0101010101010101ull;
+  out5 = src[5] * 0x0101010101010101ull;
+  out6 = src[6] * 0x0101010101010101ull;
+  out7 = src[7] * 0x0101010101010101ull;
+
+  SD4(out0, out1, out2, out3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 4; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 8; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB2(src0, src0, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src1, src1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src2, src2, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src3, src3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val0, val1;
+  v16i8 store, src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LW(src_top);
+  val1 = LW(src_left);
+  INSERT_W2_SB(val0, val1, src);
+  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
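The horizontal-add and round sequence above computes the usual DC average of the eight neighbouring pixels; __msa_srari_w by 3 is the rounding divide by 8. In scalar form (a sketch, helper name illustrative):

   /* DC value for a 4x4 block: rounded average of 4 above + 4 left pixels. */
   static uint8_t dc_value_4x4(const uint8_t *above, const uint8_t *left) {
     int i, sum = 0;
     for (i = 0; i < 4; ++i) sum += above[i] + left[i];
     return (uint8_t)((sum + 4) >> 3);
   }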
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t val0;
+  v16i8 store, data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+
+  val0 = LW(src);
+  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  v16i8 store;
+  v16u8 src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src_top);
+  val1 = LD(src_left);
+  INSERT_D2_UB(val0, val1, src);
+  sum_h = __msa_hadd_u_h(src, src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t val0;
+  v16i8 store;
+  v16u8 data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src);
+  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+  uint64_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(out, out, out, out, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  v16u8 top, left, out;
+  v8u16 sum_h, sum_top, sum_left;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  top = LD_UB(src_top);
+  left = LD_UB(src_left);
+  HADD_UB2_UH(top, left, sum_top, sum_left);
+  sum_h = sum_top + sum_left;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  v16u8 data, out;
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  data = LD_UB(src);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  v16u8 top0, top1, left0, left1, out;
+  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src_top, 16, top0, top1);
+  LD_UB2(src_left, 16, left0, left1);
+  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+  sum_h = sum_top0 + sum_top1;
+  sum_h += sum_left0 + sum_left1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  v16u8 data0, data1, out;
+  v8u16 sum_h, sum_data0, sum_data1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src, 16, data0, data1);
+  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+  sum_h = sum_data0 + sum_data1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v16u8 src0, src1, src2, src3;
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+  val = LW(src_top_ptr);
+  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
+
+  src_left0 = __msa_fill_b(src_left[0]);
+  src_left1 = __msa_fill_b(src_left[1]);
+  src_left2 = __msa_fill_b(src_left[2]);
+  src_left3 = __msa_fill_b(src_left[3]);
+
+  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+             src_left3, src_top, src0, src1, src2, src3);
+  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
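TM ("true motion") prediction forms each pixel as left + above - top_left, clamped to the 8-bit range. The vector code reaches the same result with unsigned saturating arithmetic: HADD_UB4_UH adds the left and above bytes, IPRED_SUBS_UH2_UH subtracts the top-left pixel with saturation at zero, and SAT_UH4_UH(..., 7) caps the sums at 255. A scalar sketch of the same formula (helper name illustrative):

   /* TM prediction: clamp(left[r] + above[c] - above[-1], 0, 255) per pixel. */
   static void tm_predict_4x4_sketch(const uint8_t *above, const uint8_t *left,
                                     uint8_t *dst, int stride) {
     const int top_left = above[-1];
     int r, c, v;
     for (r = 0; r < 4; ++r) {
       for (c = 0; c < 4; ++c) {
         v = left[r] + above[c] - top_left;
         dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
       }
     }
   }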
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+  v16u8 src0, src1, src2, src3;
+
+  val = LD(src_top_ptr);
+  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 2; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r, res_l;
+
+  src_top = LD_SB(src_top_ptr);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 4; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+  LD_SB2(src_top, 16, src_top0, src_top1);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_32x32_msa(dst, y_stride);
+}
+
+void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
+}
+
+void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
+}
diff --git a/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
new file mode 100644
index 0000000..abd8509
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
+                                                                               \
+  int32_t tmp, out;                                                            \
+  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \
+  int     in = input;                                                          \
+                                                                               \
+  __asm__ __volatile__ (                                                       \
+      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \
+      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\
+      "mthi     $zero,                  $ac1                              \n\t"\
+      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\
+      "extp     %[tmp],                 $ac1,             31              \n\t"\
+                                                                               \
+      /* out = dct_const_round_shift(out * cospi_16_64); */                    \
+      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\
+      "mthi     $zero,                  $ac2                              \n\t"\
+      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\
+      "extp     %[out],                 $ac2,             31              \n\t"\
+                                                                               \
+      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \
+      : [in] "r" (in),                                                         \
+        [dct_cost_rounding] "r" (dct_cost_rounding),                           \
+        [cospi_16_64] "r" (cospi_16_64)                                        \
+   );                                                                          \
+  out;                                                                    })
+
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride);
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output);
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride);
+void iadst4_dspr2(const int16_t *input, int16_t *output);
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride);
+void iadst8_dspr2(const int16_t *input, int16_t *output);
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows);
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride);
+void iadst16_dspr2(const int16_t *input, int16_t *output);
+
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libvpx/vpx_dsp/mips/inv_txfm_msa.h
new file mode 100644
index 0000000..1458561
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/inv_txfm_msa.h
@@ -0,0 +1,410 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#define VPX_DSP_MIPS_INV_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,               \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {     \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
+  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
+    cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };               \
+  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
+    -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                    \
+                                                                        \
+  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
+  cnst2_m = -cnst0_m;                                                   \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
+  cnst4_m = -cnst2_m;                                                   \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                        \
+  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst1_m, cnst2_m, cnst3_m, in7, in0,            \
+                        in4, in3);                                      \
+                                                                        \
+  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
+  cnst2_m = -cnst0_m;                                                   \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
+  cnst4_m = -cnst2_m;                                                   \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                        \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+                                                                        \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst1_m, cnst2_m, cnst3_m, in5, in2,            \
+                        in6, in1);                                      \
+  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
+  out7 = -s0_m;                                                         \
+  out0 = s1_m;                                                          \
+                                                                        \
+  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
+               cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
+                                                                        \
+  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+  cnst1_m = cnst0_m;                                                    \
+                                                                        \
+  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst2_m, cnst3_m, cnst1_m, out1, out6,          \
+                        s0_m, s1_m);                                    \
+                                                                        \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+                                                                        \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                \
+                                                                        \
+  out1 = -out1;                                                         \
+  out3 = -out3;                                                         \
+  out5 = -out5;                                                         \
+}
+
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({  \
+  v8i16 out0_m, r0_m, r1_m;                \
+                                           \
+  r0_m = __msa_fill_h(c0_h);               \
+  r1_m = __msa_fill_h(c1_h);               \
+  out0_m = __msa_ilvev_h(r1_m, r0_m);      \
+                                           \
+  out0_m;                                  \
+})
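VP9_SET_COSPI_PAIR builds a vector whose even 16-bit lanes hold c0_h and whose odd lanes hold c1_h. When such a vector is dotted (DOTP_SH) against data interleaved as {x, y} pairs by ILVR/ILVL, each 32-bit result lane is x*c0 + y*c1, which SRARI_W by DCT_CONST_BITS then rounds and shifts. The per-lane arithmetic, as a sketch:

   /* One 32-bit lane of DOTP_SH on {x, y} data against a cospi pair. */
   static int32_t cospi_pair_dot(int16_t x, int16_t y, int16_t c0, int16_t c1) {
     return (int32_t)x * c0 + (int32_t)y * c1;
   }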
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) {  \
+  uint8_t *dst_m = (uint8_t *) (dst);                               \
+  v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                             \
+  v16i8 tmp0_m, tmp1_m;                                             \
+  v16i8 zero_m = { 0 };                                             \
+  v8i16 res0_m, res1_m, res2_m, res3_m;                             \
+                                                                    \
+  LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);        \
+  ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,        \
+             zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);       \
+  ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,          \
+       res0_m, res1_m, res2_m, res3_m);                             \
+  CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                   \
+  PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);      \
+  ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                      \
+}
+
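+/* One pass of the 4-point inverse DCT: even/odd dot products with cospi
+ * constants, rounding by DCT_CONST_BITS, then a final butterfly. */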
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) {   \
+  v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
+  v8i16 step0_m, step1_m;                                           \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+                                                                    \
+  c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+  c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+  step0_m = __msa_ilvr_h(in2, in0);                                 \
+  DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
+                                                                    \
+  c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+  c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+  step1_m = __msa_ilvr_h(in3, in1);                                 \
+  DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);      \
+                                                                    \
+  PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
+  SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
+  BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m,                         \
+              (v8i16)tmp2_m, (v8i16)tmp3_m,                         \
+              out0, out1, out2, out3);                              \
+}
+
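+/* One pass of the 4-point inverse ADST built from sinpi_x_9 dot products. */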
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
+  v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
+  v8i16 zero_m = { 0 };                                             \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+  v4i32 int0_m, int1_m, int2_m, int3_m;                             \
+  v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
+    sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                  \
+    -sinpi_4_9 };                                                   \
+                                                                    \
+  SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
+  ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
+  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
+  int0_m = tmp2_m + tmp1_m;                                         \
+                                                                    \
+  SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
+  ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+  int1_m = tmp0_m + tmp1_m;                                         \
+                                                                    \
+  c0_m = __msa_splati_h(mask_m, 6);                                 \
+  ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
+  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+  int2_m = tmp0_m + tmp1_m;                                         \
+                                                                    \
+  c0_m = __msa_splati_h(mask_m, 6);                                 \
+  c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
+                                                                    \
+  res0_m = __msa_ilvr_h((in1), (in3));                              \
+  tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
+  int3_m = tmp2_m + tmp0_m;                                         \
+                                                                    \
+  res0_m = __msa_ilvr_h((in2), (in3));                              \
+  c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
+                                                                    \
+  tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
+  res1_m = __msa_ilvr_h((in0), (in2));                              \
+  c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
+                                                                    \
+  tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
+  int3_m += tmp2_m;                                                 \
+  int3_m += tmp3_m;                                                 \
+                                                                    \
+  SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);      \
+  PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
+  PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
+}
+
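+/* Like VP9_SET_COSPI_PAIR, but the two constants are taken from lanes
+ * idx1_h and idx2_h of the mask vector mask_h. */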
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({  \
+  v8i16 c0_m, c1_m;                                    \
+                                                       \
+  SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);    \
+  c0_m = __msa_ilvev_h(c1_m, c0_m);                    \
+                                                       \
+  c0_m;                                                \
+})
+
+/* multiply and add macro */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,        \
+                 out0, out1, out2, out3) {                              \
+  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                        \
+  ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
+  ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
+  DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
+              cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
+  DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
+              cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
+}
+
+/* idct 8x8 macro */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,               \
+                       out0, out1, out2, out3, out4, out5, out6, out7) {     \
+  v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
+  v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+  v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
+    cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                  \
+                                                                             \
+  k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
+  k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
+  k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
+  k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
+  VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
+  SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
+  k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
+  k1_m = __msa_splati_h(mask_m, 4);                                          \
+                                                                             \
+  ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
+  DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
+              tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);               \
+  tp4_m = in1 + in3;                                                         \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
+  tp7_m = in7 + in5;                                                         \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
+  VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
+           in0, in4, in2, in6);                                              \
+  BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
+  BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
+              out0, out1, out2, out3, out4, out5, out6, out7);               \
+}
+
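+/* One pass of the 8-point inverse ADST (outputs 1, 3, 5 and 7 are negated
+ * at the end). */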
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
+  v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
+  v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
+  v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
+    cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };    \
+  v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
+    cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };      \
+  v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
+    -cospi_16_64, 0, 0, 0, 0 };                                            \
+                                                                           \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
+  ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
+  ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
+  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
+  ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
+  ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
+  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
+  ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
+  BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
+  k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
+  ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
+  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
+  SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
+  k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
+  ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              m0_m, m1_m, m2_m, m3_m);                                     \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
+  ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
+  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+              m0_m, m1_m, m2_m, m3_m);                                     \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
+                                                                           \
+  out1 = -in1;                                                             \
+  out3 = -in3;                                                             \
+  out5 = -in5;                                                             \
+  out7 = -in7;                                                             \
+}
+
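+/* One pass of the 16-point inverse ADST over eight v8i16 lanes, computed in
+ * the four stages marked below. */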
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,        \
+                         r9, r10, r11, r12, r13, r14, r15,          \
+                         out0, out1, out2, out3, out4, out5,        \
+                         out6, out7, out8, out9, out10, out11,      \
+                         out12, out13, out14, out15) {              \
+  v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
+  v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
+  v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
+  v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
+  v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
+                                                                    \
+  /* stage 1 */                                                     \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
+  MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,                  \
+          g0_m, g1_m, g2_m, g3_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
+  MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,                 \
+          g4_m, g5_m, g6_m, g7_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
+  MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,                 \
+          g8_m, g9_m, g10_m, g11_m);                                \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
+  MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,                  \
+          g12_m, g13_m, g14_m, g15_m);                              \
+                                                                    \
+  /* stage 2 */                                                     \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
+  MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,          \
+          h0_m, h1_m, h2_m, h3_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
+  MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,         \
+          h4_m, h5_m, h6_m, h7_m);                                  \
+  BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
+  BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
+              h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
+                                                                    \
+  /* stage 3 */                                                     \
+  BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
+  MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,           \
+          out4, out6, out5, out7);                                  \
+  MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,           \
+          out12, out14, out13, out15);                              \
+                                                                    \
+  /* stage 4 */                                                     \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+  k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
+  MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                 \
+  MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                   \
+  MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);               \
+  MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);               \
+}
+
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                      int32_t dst_stride);
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride);
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
+#endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/libvpx/vpx_dsp/mips/itrans16_dspr2.c
new file mode 100644
index 0000000..6d41e61
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/itrans16_dspr2.c
@@ -0,0 +1,1227 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
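+/* Row pass of the 16x16 inverse DCT using MIPS DSP ASE multiply-accumulate;
+ * const_2_power_13 (8192 == 1 << (DCT_CONST_BITS - 1)) preloads each
+ * accumulator with the rounding term. */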
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_10, step1_11, step1_12, step1_13;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+
+  for (i = no_rows; i--; ) {
+    /* prefetch row */
+    prefetch_load((const uint8_t *)(input + 16));
+
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
+        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
+        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                     \n\t"
+        "lh       %[load6],             28(%[input])                    \n\t"
+        "lh       %[load7],             20(%[input])                    \n\t"
+        "lh       %[load8],             12(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
+        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
+        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
+        "sh       %[load5],             0(%[output])                    \n\t"
+        "sh       %[load6],             32(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "sh       %[load5],             192(%[output])                  \n\t"
+        "sh       %[load6],             224(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
+        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "sh       %[load5],             256(%[output])                  \n\t"
+        "sh       %[load6],             288(%[output])                  \n\t"
+        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
+        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
+        "sh       %[load5],             448(%[output])                  \n\t"
+        "sh       %[load6],             480(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+    );
+
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
+        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
+        "sh       %[load5],             64(%[output])                   \n\t"
+        "sh       %[load6],             96(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
+        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
+        "sh       %[load5],             128(%[output])                  \n\t"
+        "sh       %[load6],             160(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
+        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
+        "sh       %[load5],             320(%[output])                  \n\t"
+        "sh       %[load6],             352(%[output])                  \n\t"
+        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
+        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
+        "sh       %[load5],             384(%[output])                  \n\t"
+        "sh       %[load6],             416(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+    );
+
+    input += 16;
+    output += 1;
+  }
+}
+
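+/* Column pass of the 16x16 inverse DCT; the results are added to dest, with
+ * vpx_ff_cropTbl serving as the 8-bit clipping table. */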
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_8, step1_9, step1_10, step1_11;
+  int step1_12, step1_13, step1_14, step1_15;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 16; ++i) {
+    dest_pix = (dest + i);
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
+        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
+        "extp     %[result4],           $ac2,            31             \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
+        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
+        "extp     %[result1],           $ac1,        31                 \n\t"
+
+        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
+        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
+        "extp     %[result2],           $ac3,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
+        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
+        "extp     %[result3],           $ac1,        31                 \n\t"
+
+        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
+        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
+        "extp     %[result4],           $ac2,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                   \n\t"
+        "lh       %[load6],             28(%[input])                  \n\t"
+        "lh       %[load7],             20(%[input])                  \n\t"
+        "lh       %[load8],             12(%[input])                  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
+        "mthi     $zero,                $ac3                          \n\t"
+
+        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
+        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
+        "extp     %[result1],           $ac1,        31               \n\t"
+
+        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
+        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
+        "extp     %[result2],           $ac3,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
+        "mthi     $zero,                $ac2                          \n\t"
+
+        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
+        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
+        "extp     %[result3],           $ac1,        31               \n\t"
+
+        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
+        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
+        "extp     %[result4],           $ac2,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step1_8 = step2_8 + step2_11;
+    step1_9 = step2_9 + step2_10;
+    step1_14 = step2_13 + step2_14;
+    step1_15 = step2_12 + step2_15;
+
+    __asm__ __volatile__ (
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+    );
+
+    input += 16;
+  }
+}
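
Each lbu/addi/sra/lbux/sb group in the store block above performs the same
per-pixel reconstruction step: round the column-pass value by 2^6, add it to
the existing destination pixel, and clamp the sum to 0..255 (the lbux lookup
through cm, i.e. vpx_ff_cropTbl, is the table form of that clamp). A minimal C
sketch of the step, with an illustrative clip helper that is not part of the
library API:

    #include <stdint.h>

    /* Clamp an intermediate sum to the 8-bit pixel range. */
    static uint8_t clip_pixel_sketch(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Round the IDCT column result by 6 bits (addi 32 / sra 6 above),
       add it to the destination sample and clamp, as the asm does per byte. */
    static void recon_pixel_sketch(uint8_t *dest, int idct_value) {
      *dest = clip_pixel_sketch(*dest + ((idct_value + 32) >> 6));
    }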
+
+void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct16_rows_dspr2(input, out, 16);
+
+  // Then transform columns and add to dest
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
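
All of these functions program the DSP control register once, up front, so
that the extp instructions in the inline assembly extract the intended bit
range from the accumulators. Assuming the standard MIPS DSP ASE semantics of
wrdsp/extp (pos = 45 and a size field of 31 select accumulator bits 45..14),
and given that every accumulator is preloaded with const_2_power_13 = 8192 via
mtlo, each extracted value amounts to

    (sum_of_products + 2^13) >> 14

which is the same rounding as dct_const_round_shift() in the C reference code,
where DCT_CONST_BITS is 14.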
+
+void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
+  int16_t *outptr = out;
+  uint32_t i;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+  idct16_rows_dspr2(input, outptr, 4);
+
+  outptr += 4;
+  for (i = 0; i < 6; ++i) {
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[outptr])     \n\t"
+        "sw     $zero,   32(%[outptr])     \n\t"
+        "sw     $zero,   64(%[outptr])     \n\t"
+        "sw     $zero,   96(%[outptr])     \n\t"
+        "sw     $zero,  128(%[outptr])     \n\t"
+        "sw     $zero,  160(%[outptr])     \n\t"
+        "sw     $zero,  192(%[outptr])     \n\t"
+        "sw     $zero,  224(%[outptr])     \n\t"
+        "sw     $zero,  256(%[outptr])     \n\t"
+        "sw     $zero,  288(%[outptr])     \n\t"
+        "sw     $zero,  320(%[outptr])     \n\t"
+        "sw     $zero,  352(%[outptr])     \n\t"
+        "sw     $zero,  384(%[outptr])     \n\t"
+        "sw     $zero,  416(%[outptr])     \n\t"
+        "sw     $zero,  448(%[outptr])     \n\t"
+        "sw     $zero,  480(%[outptr])     \n\t"
+
+        :
+        : [outptr] "r" (outptr)
+    );
+
+    outptr += 2;
+  }
+
+  // Then transform columns
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
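
In this 10-coefficient path only the first four entries of each 16-element row
of the intermediate buffer are produced by the row pass, so the sw $zero loop
above clears entries 4..15 of all 16 rows before the column pass reads them.
A sketch of the equivalent clearing in plain C (not the code the asm uses):

    #include <stdint.h>
    #include <string.h>

    /* Zero entries 4..15 of each 16-entry row of the 16x16 buffer. */
    static void clear_high_entries_sketch(int16_t out[16 * 16]) {
      int i;
      for (i = 0; i < 16; ++i)
        memset(&out[i * 16 + 4], 0, 12 * sizeof(int16_t));
    }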
+
+void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t vector_a1;
+  int32_t t1, t2, t3, t4;
+  int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     32      \n\t"
+      "sra      %[a1],      %[out],     6       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],       %[a1]       \n\t"
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
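
The DC-only variant above scales the single coefficient twice by cospi_16_64
with rounding (the DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 step), rounds by
2^6, and then adds the resulting constant to every pixel of the 16x16 block
with per-byte saturation; replv.qb broadcasts |a1| into a word so each
addu_s.qb/subu_s.qb handles four pixels at once. A scalar C sketch under those
assumptions (the rounding helper stands in for dct_const_round_shift):

    #include <stdint.h>
    #include "vpx_dsp/txfm_common.h"   /* cospi_16_64 */

    /* Stand-in for dct_const_round_shift: round by DCT_CONST_BITS (14). */
    static int32_t round_shift14_sketch(int32_t x) {
      return (x + (1 << 13)) >> 14;
    }

    static void idct16x16_1_add_sketch(int16_t dc, uint8_t *dest, int stride) {
      int32_t out = round_shift14_sketch(round_shift14_sketch(dc * cospi_16_64)
                                         * cospi_16_64);
      int a1 = (out + 32) >> 6;            /* addi 32 / sra 6 in the asm */
      int r, c, v;
      for (r = 0; r < 16; ++r, dest += stride) {
        for (c = 0; c < 16; ++c) {
          v = dest[c] + a1;                /* saturating add/sub per byte */
          dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
      }
    }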
+
+void iadst16_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  int x0 = input[15];
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] =  x0;
+  output[1] = -x8;
+  output[2] =  x12;
+  output[3] = -x4;
+  output[4] =  x6;
+  output[5] =  x14;
+  output[6] =  x10;
+  output[7] =  x2;
+  output[8] =  x3;
+  output[9] =  x11;
+  output[10] =  x15;
+  output[11] =  x7;
+  output[12] =  x5;
+  output[13] = -x13;
+  output[14] =  x9;
+  output[15] = -x1;
+}
+
+
+#endif  // HAVE_DSPR2
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
similarity index 98%
rename from libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
rename to libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
index 132d88c..553acb0 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -8,17 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
-
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
 
 #if HAVE_DSPR2
-void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                    int dest_stride) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
@@ -40,17 +35,17 @@
   int i, temp21;
   uint8_t *dest_pix, *dest_pix1;
   const int const_2_power_13 = 8192;
-  uint8_t *cm = vp9_ff_cropTbl;
+  uint8_t *cm = vpx_ff_cropTbl;
 
-  /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
 
   for (i = 0; i < 32; ++i) {
     dest_pix = dest + i;
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vpx_dsp/mips/itrans32_dspr2.c
similarity index 98%
rename from libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
rename to libvpx/vpx_dsp/mips/itrans32_dspr2.c
index 74a90b0..523da1d 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/libvpx/vpx_dsp/mips/itrans32_dspr2.c
@@ -12,11 +12,8 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
 
 #if HAVE_DSPR2
 static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
@@ -96,8 +93,8 @@
     }
 
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 32));
-    vp9_prefetch_load((const uint8_t *)(input + 48));
+    prefetch_load((const uint8_t *)(input + 32));
+    prefetch_load((const uint8_t *)(input + 48));
 
     __asm__ __volatile__ (
         "lh       %[load1],             2(%[input])                     \n\t"
@@ -868,7 +865,7 @@
   }
 }
 
-void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
   DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);
   int16_t *outptr = out;
@@ -885,10 +882,10 @@
   idct32_rows_dspr2(input, outptr, 32);
 
   // Columns
-  vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
+  vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
-void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
                                 int stride) {
   DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);
   int16_t *outptr = out;
@@ -947,10 +944,10 @@
   }
 
   // Columns
-  vp9_idct32_cols_add_blk_dspr2(out, dest, stride);
+  vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
 }
 
-void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                                int stride) {
   int       r, out;
   int32_t   a1, absa1;
diff --git a/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libvpx/vpx_dsp/mips/itrans4_dspr2.c
new file mode 100644
index 0000000..ecb8bd3
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -0,0 +1,359 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;
+  int       i;
+
+  for (i = 4; i--; ) {
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          output[0]  = step_0 + step_3;
+          output[4]  = step_1 + step_2;
+          output[8]  = step_1 - step_2;
+          output[12] = step_0 - step_3;
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+
+        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp1],             8(%[output])                    \n\t"
+
+        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp2],             16(%[output])                   \n\t"
+
+        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp3],             24(%[output])                   \n\t"
+
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [output] "+r" (output)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input)
+    );
+
+    input += 4;
+    output += 1;
+  }
+}
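
The assembly above follows the reference 4-point IDCT butterfly spelled out in
its comments, and it stores each result at a stride of four int16 elements
(byte offsets 0/8/16/24) while advancing the output pointer by one element per
input row, so the row pass effectively leaves its result transposed for the
column pass to read contiguously. A plain C sketch of one row under those
assumptions (the rounding helper stands in for dct_const_round_shift):

    #include <stdint.h>
    #include "vpx_dsp/txfm_common.h"   /* cospi_8_64, cospi_16_64, cospi_24_64 */

    static int32_t round_shift14_sketch(int32_t x) {
      return (x + (1 << 13)) >> 14;
    }

    /* One row of the 4-point IDCT, written with a stride of 4 (transposed). */
    static void idct4_row_sketch(const int16_t *in, int16_t *out) {
      int32_t step0 = round_shift14_sketch((in[0] + in[2]) * cospi_16_64);
      int32_t step1 = round_shift14_sketch((in[0] - in[2]) * cospi_16_64);
      int32_t step2 = round_shift14_sketch(in[1] * cospi_24_64 - in[3] * cospi_8_64);
      int32_t step3 = round_shift14_sketch(in[1] * cospi_8_64 + in[3] * cospi_24_64);
      out[0]  = (int16_t)(step0 + step3);
      out[4]  = (int16_t)(step1 + step2);
      out[8]  = (int16_t)(step1 - step2);
      out[12] = (int16_t)(step0 - step3);
    }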
+
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride) {
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;
+  int       i;
+  uint8_t   *dest_pix;
+  uint8_t   *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 4; ++i) {
+    dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          output[0]  = step_0 + step_3;
+          output[4]  = step_1 + step_2;
+          output[8]  = step_1 - step_2;
+          output[12] = step_0 - step_3;
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [dest_pix] "+r" (dest_pix)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+    );
+
+    input += 4;
+  }
+}
+
+void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // Rows
+  vpx_idct4_rows_dspr2(input, outptr);
+
+  // Columns
+  vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  int       a1, absa1;
+  int       r;
+  int32_t   out;
+  int       t2, vector_a1, vector_a;
+  uint32_t  pos = 45;
+  int16_t   input_dc = input[0];
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],    8       \n\t"
+      "sra      %[a1],      %[out],    4       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],     %[a1]         \n\t"
+        "replv.qb   %[vector_a1], %[absa1]      \n\t"
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t2],          0(%[dest])                      \n\t"
+          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
+          "sw             %[vector_a],    0(%[dest])                      \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb       %[vector_a1],   %[a1]     \n\t"
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t2],          0(%[dest])                        \n\t"
+          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
+          "sw           %[vector_a],    0(%[dest])                        \n\t"
+          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
+
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+void iadst4_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+#endif  // HAVE_DSPR2
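
The dynamic-range comment in iadst4_dspr2 follows directly from the rounding:
dct_const_round_shift() drops DCT_CONST_BITS of precision, and with
DCT_CONST_BITS assumed to be 14 (the scale used by the cospi/sinpi constants),
the 29-bit intermediate becomes

    29 - 14 = 15 bits,

which is why the results still fit the int16_t output array.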
diff --git a/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/libvpx/vpx_dsp/mips/itrans8_dspr2.c
new file mode 100644
index 0000000..823e845
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/itrans8_dspr2.c
@@ -0,0 +1,668 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  const int const_2_power_13 = 8192;
+  int Temp0, Temp1, Temp2, Temp3, Temp4;
+  int i;
+
+  for (i = no_rows; i--; ) {
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[Temp4],             $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp1],             16(%[output])                   \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp0],             32(%[output])                   \n\t"
+        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp1],             48(%[output])                   \n\t"
+
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp0],             64(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp1],             80(%[output])                   \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp0],             96(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp1],             112(%[output])                  \n\t"
+
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [Temp4] "=&r" (Temp4)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [output] "r" (output), [input] "r" (input)
+    );
+
+    input += 8;
+    output += 1;
+  }
+}
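+
+/*
+  Note on the rounding trick used throughout the asm blocks in this file:
+  8192 (= 1 << (DCT_CONST_BITS - 1)) is preloaded into the accumulator with
+  mtlo/mthi, the butterfly products are accumulated with madd/msub, and
+  "extp ..., 31" with DSPControl pos = 45 extracts bits 45..14, which is the
+  same as the generic helper
+
+      dct_const_round_shift(sum) == ROUND_POWER_OF_TWO(sum, 14)
+                                 == (sum + 8192) >> 14
+
+  so every extp above stands in for one rounded descale of an accumulated
+  term.
+*/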
+
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int Temp0, Temp1, Temp2, Temp3;
+  int i;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 8; ++i) {
+    dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_6],           $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /* add block */
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dest_pix] "+r" (dest_pix)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+    );
+
+    input += 8;
+  }
+}
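+
+/*
+  Note: idct8_rows_dspr2() stores each result at a stride of 8 int16s
+  (byte offsets 0, 16, 32, ... of output) while advancing output by one
+  element per processed row, so the row-pass results land in the out[]
+  buffer already transposed and idct8_columns_add_blk_dspr2() can read its
+  inputs contiguously, adding one reconstructed column of dest per loop
+  iteration.
+*/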
+
+void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extraction from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_rows_dspr2(input, outptr, 8);
+
+  // Then transform columns and add to dest
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extraction from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct8_rows_dspr2(input, outptr, 4);
+
+  outptr += 4;
+
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+
+      :
+      : [outptr] "r" (outptr)
+  );
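+
+  /*
+    Note: for the 12-coefficient case only the top-left 4x4 of the input is
+    nonzero, so only four rows are transformed above; the $zero stores clear
+    columns 4..7 of every row of the intermediate buffer (outptr points at
+    out + 4 and each row of out[] is 16 bytes) before the column pass runs
+    over the full 8x8 block.
+  */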
+
+  // Then transform columns and add to dest
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* bit position for extraction from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     16      \n\t"
+      "sra      %[a1],      %[out],     5       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
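+
+  /*
+    Note: a1 is the rounded DC value, (out + 16) >> 5, applied to every pixel
+    of the 8x8 block.  The sign check below selects saturating quad-byte
+    subtraction or addition (subu_s.qb / addu_s.qb), which operate on
+    unsigned bytes; replv.qb broadcasts |a1| into all four byte lanes of a
+    word, so two word operations cover one 8-pixel row.
+  */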
+
+  if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],       %[a1]       \n\t"
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3, x4, x5, x6, x7;
+
+  x0 = input[7];
+  x1 = input[0];
+  x2 = input[5];
+  x3 = input[2];
+  x4 = input[3];
+  x5 = input[4];
+  x6 = input[1];
+  x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+  output[0] =  x0;
+  output[1] = -x4;
+  output[2] =  x6;
+  output[3] = -x2;
+  output[4] =  x3;
+  output[5] = -x7;
+  output[6] =  x5;
+  output[7] = -x1;
+}
+#endif  // HAVE_DSPR2
diff --git a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
new file mode 100644
index 0000000..b7c9f7b
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -0,0 +1,1480 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+                                 uint8_t *filter48,
+                                 const uint8_t *b_limit_ptr,
+                                 const uint8_t *limit_ptr,
+                                 const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+
+  /* load vector elements */
+  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+    return 1;
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16);
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48);
+
+    return 0;
+  }
+}
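+
+/*
+  Note: on the non-early-exit path the filter8 results are parked in the
+  filter48[] scratch buffer as six 16-byte vectors -- p2, p1, p0, q0 at byte
+  offsets 0..48, q1 and q2 at 64 and 80 -- with the flat mask at offset 96;
+  vpx_hz_lpf_t16_16w() below reloads them from the same offsets.
+*/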
+
+void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+  v16u8 flat, flat2, filter8;
+  v16i8 zero = { 0 };
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+  v8i16 l_out, r_out;
+
+  flat = LD_UB(filter48 + 96);
+
+  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__msa_test_bz_v(flat2)) {
+    LD_UB4(filter48, 16, p2, p1, p0, q0);
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    src -= 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1, q2, src, pitch);
+  } else {
+    src -= 7 * pitch;
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+               p2_r_in, p1_r_in, p0_r_in);
+
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+    tmp0_r = p7_r_in << 3;
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
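+
+    /*
+      Note: tmp1_r now holds the 15-tap sum 7*p7 + 2*p6 + p5 + ... + p0 + q0,
+      and r_out is its rounded average ((sum + 8) >> 4) used for the new p6.
+      Each step below updates the sum in sliding-window fashion (the next q
+      sample comes in, one copy of p7 and of the departing sample go out)
+      before re-averaging for the next output pixel.
+    */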
+
+    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+               p5_l_in, p4_l_in);
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+               p1_l_in, p0_l_in);
+    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+    ST_UB(p6, src);
+    src += pitch;
+
+    /* p5 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST_UB(p5, src);
+    src += pitch;
+
+    /* p4 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
+
+    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST_UB(p4, src);
+    src += pitch;
+
+    /* p3 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST_UB(p3, src);
+    src += pitch;
+
+    /* p2 */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48);
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* p1 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16);
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* p0 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32);
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q0 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48);
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q1 */
+    filter8 = LD_UB(filter48 + 64);
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q2 */
+    filter8 = LD_UB(filter48 + 80);
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q3 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST_UB(q3, src);
+    src += pitch;
+
+    /* q4 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST_UB(q4, src);
+    src += pitch;
+
+    /* q5 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST_UB(q5, src);
+    src += pitch;
+
+    /* q6 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST_UB(q6, src);
+  }
+}
+
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr,
+                                    int32_t count) {
+  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
+  uint8_t early_exit = 0;
+
+  (void)count;
+
+  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+                                        limit_ptr, thresh_ptr);
+
+  if (0 == early_exit) {
+    vpx_hz_lpf_t16_16w(src, pitch, filter48);
+  }
+}
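+
+/*
+  Note: vpx_lpf_horizontal_16_msa() below filters an 8-pixel-wide edge when
+  count == 1 (results are written back with 64-bit stores); any other count
+  is dispatched to the 16-pixel-wide dual version above, which ignores the
+  count argument.
+*/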
+
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr,
+                               int32_t count) {
+  if (1 == count) {
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    uint64_t dword0, dword1;
+    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 p0_filter16, p1_filter16;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+    v8u16 tmp0, tmp1, tmp2;
+
+    /* load vector elements */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+    if (__msa_test_bz_v(flat)) {
+      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                 q3_r);
+      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                  q0_filter8);
+      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+      /* store pixel values */
+      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+      /* load 16 vector elements */
+      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__msa_test_bz_v(flat2)) {
+        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+        SD(q1_d, src + pitch);
+        SD(q2_d, src + 2 * pitch);
+      } else {
+        /* right (LSB) 8-pixel operation */
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
+                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
+                   q7_r);
+
+        tmp0 = p7_r << 3;
+        tmp0 -= p7_r;
+        tmp0 += p6_r;
+        tmp0 += q0_r;
+
+        src -= 7 * pitch;
+
+        /* calculation of p6 and p5 */
+        tmp1 = p6_r + p5_r + p4_r + p3_r;
+        tmp1 += (p2_r + p1_r + p0_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp0 = p5_r - p6_r + q1_r - p7_r;
+        tmp1 += tmp0;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p4 and p3 */
+        tmp0 = p4_r - p5_r + q2_r - p7_r;
+        tmp2 = p3_r - p4_r + q3_r - p7_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p2 and p1 */
+        tmp0 = p2_r - p3_r + q4_r - p7_r;
+        tmp2 = p1_r - p2_r + q5_r - p7_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p0 and q0 */
+        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q1 and q2 */
+        tmp0 = q7_r - q0_r + q1_r - p6_r;
+        tmp2 = q7_r - q1_r + q2_r - p5_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q3 and q4 */
+        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q5 and q6 */
+        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+      }
+    }
+  } else {
+    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+                                   thresh_ptr, count);
+  }
+}
+
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch,
+         p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+  /* 8x8 transpose */
+  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+  /* 8x8 transpose */
+  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+             tmp0, tmp1, tmp2, tmp3);
+  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch,
+                            uint8_t *output, int32_t out_pitch) {
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp2, tmp3;
+
+  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+  input += (8 * in_pitch);
+  LD_UB8(input, in_pitch,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p7, p6, p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total: 8 intermediate registers and 32 instructions */
+  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
+  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
+  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
+  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
+  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+                                uint8_t *src_org, int32_t pitch_org,
+                                const uint8_t *b_limit_ptr,
+                                const uint8_t *limit_ptr,
+                                const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16i8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3;
+
+  /* load vector elements */
+  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+    return 1;
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    /* convert 16 bit output data into 8 bit */
+    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
+    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
+    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
+    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
+    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
+    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16);
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48);
+
+    return 0;
+  }
+}
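+
+/*
+  Note: the vertical filters below operate on a transposed copy of the edge
+  (src with a fixed pitch of 16 bytes, presumably built by the transpose
+  helpers above), while src_org / pitch_org still address the original image
+  so the 4-tap early-exit results can be written straight back with the
+  ST4x8 / ST4x4 / ST2x4 stores.
+*/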
+
+int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                          uint8_t *filter48) {
+  v16i8 zero = { 0 };
+  v16u8 filter8, flat, flat2;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 tmp0_r, tmp1_r;
+  v8i16 r_out;
+
+  flat = LD_UB(filter48 + 6 * 16);
+
+  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__msa_test_bz_v(flat2)) {
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    LD_UB4(filter48, 16, p2, p1, p0, q0);
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+    src_org -= 3;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+    return 1;
+  } else {
+    src -= 7 * 16;
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+    tmp0_r = p7_r_in << 3;
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r;
+
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+    ST8x1_UB(p6, src);
+    src += 16;
+
+    /* p5 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST8x1_UB(p5, src);
+    src += 16;
+
+    /* p4 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST8x1_UB(p4, src);
+    src += 16;
+
+    /* p3 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST8x1_UB(p3, src);
+    src += 16;
+
+    /* p2 */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48);
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* p1 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16);
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* p0 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32);
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q0 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48);
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q1 */
+    filter8 = LD_UB(filter48 + 64);
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q2 */
+    filter8 = LD_UB(filter48 + 80);
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q3 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST8x1_UB(q3, src);
+    src += 16;
+
+    /* q4 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST8x1_UB(q4, src);
+    src += 16;
+
+    /* q5 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST8x1_UB(q5, src);
+    src += 16;
+
+    /* q6 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST8x1_UB(q6, src);
+
+    return 0;
+  }
+}
+
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+                             const uint8_t *b_limit_ptr,
+                             const uint8_t *limit_ptr,
+                             const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+  uint8_t *filter48 = &transposed_input[16 * 16];
+
+  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+                                       &filter48[0], src, pitch, b_limit_ptr,
+                                       limit_ptr, thresh_ptr);
+
+  if (0 == early_exit) {
+    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+                                   &filter48[0]);
+
+    if (0 == early_exit) {
+      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+    }
+  }
+}
+
+int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+                                 uint8_t *src_org, int32_t pitch,
+                                 const uint8_t *b_limit_ptr,
+                                 const uint8_t *limit_ptr,
+                                 const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16i8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+  /* load vector elements */
+  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+    src_org -= 2;
+    ST4x8_UB(vec2, vec3, src_org, pitch);
+    src_org += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src_org, pitch);
+
+    return 1;
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16);
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48);
+
+    return 0;
+  }
+}
+
+int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                           uint8_t *filter48) {
+  v16u8 flat, flat2, filter8;
+  v16i8 zero = { 0 };
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+  v8i16 l_out, r_out;
+
+  flat = LD_UB(filter48 + 6 * 16);
+
+  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__msa_test_bz_v(flat2)) {
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    LD_UB4(filter48, 16, p2, p1, p0, q0);
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+    ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+    src_org -= 3;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+    ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+    return 1;
+  } else {
+    src -= 7 * 16;
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+    tmp0_r = p7_r_in << 3;
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+               p5_l_in, p4_l_in);
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+               p1_l_in, p0_l_in);
+    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+    ST_UB(p6, src);
+    src += 16;
+
+    /* p5 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST_UB(p5, src);
+    src += 16;
+
+    /* p4 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST_UB(p4, src);
+    src += 16;
+
+    /* p3 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST_UB(p3, src);
+    src += 16;
+
+    /* p2 */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48);
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* p1 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16);
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* p0 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32);
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q0 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48);
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q1 */
+    filter8 = LD_UB(filter48 + 64);
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q2 */
+    filter8 = LD_UB(filter48 + 80);
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q3 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST_UB(q3, src);
+    src += 16;
+
+    /* q4 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST_UB(q4, src);
+    src += 16;
+
+    /* q5 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST_UB(q5, src);
+    src += 16;
+
+    /* q6 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST_UB(q6, src);
+
+    return 0;
+  }
+}
+
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+  uint8_t *filter48 = &transposed_input[16 * 16];
+
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+                                        &filter48[0], src, pitch, b_limit_ptr,
+                                        limit_ptr, thresh_ptr);
+
+  if (0 == early_exit) {
+    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+                                    &filter48[0]);
+
+    if (0 == early_exit) {
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
+    }
+  }
+}
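
The flat2 branches above evaluate the 15-tap wide filter as a running sum: the first output (p6) is built from scratch, and every later output updates tmp1_r/tmp1_l by adding the next q sample and subtracting the sample that drops out of the window, then rounds with __msa_srari_h(.., 4). As a rough scalar sketch of that arithmetic (an illustrative helper, not libvpx's own code), the p6 output is:

  /* Illustrative scalar form of the first wide-filter output; the
     (sum + 8) >> 4 rounding matches __msa_srari_h(.., 4). */
  static uint8_t wide_filter_p6(uint8_t p7, uint8_t p6, uint8_t p5, uint8_t p4,
                                uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0) {
    uint32_t sum = 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0;
    return (uint8_t)((sum + 8) >> 4);
  }

The p5 output then reuses the same sum with "+ q1 + p5 - p6 - p7", and so on down to q6, which is why each step only needs a few vector additions instead of a full 15-tap recomputation.
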
diff --git a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
new file mode 100644
index 0000000..daf5f38
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr,
+                              int32_t count) {
+  uint64_t p1_d, p0_d, q0_d, q1_d;
+  v16u8 mask, hev, flat, thresh, b_limit, limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+  (void)count;
+
+  /* load vector elements */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0_ptr,
+                                   const uint8_t *limit0_ptr,
+                                   const uint8_t *thresh0_ptr,
+                                   const uint8_t *b_limit1_ptr,
+                                   const uint8_t *limit1_ptr,
+                                   const uint8_t *thresh1_ptr) {
+  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+  /* load vector elements */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr,
+                            int32_t count) {
+  v16u8 mask, hev, flat, limit, thresh, b_limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v8i16 vec0, vec1, vec2, vec3;
+
+  (void)count;
+
+  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                     p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+  src -= 2;
+  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+  src += 4 * pitch;
+  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0_ptr,
+                                 const uint8_t *limit0_ptr,
+                                 const uint8_t *thresh0_ptr,
+                                 const uint8_t *b_limit1_ptr,
+                                 const uint8_t *limit1_ptr,
+                                 const uint8_t *thresh1_ptr) {
+  v16u8 mask, hev, flat;
+  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(src - 4 + (8 * pitch), pitch,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+  src -= 2;
+
+  ST4x8_UB(tmp2, tmp3, src, pitch);
+  src += (8 * pitch);
+  ST4x8_UB(tmp4, tmp5, src, pitch);
+}
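
In the *_dual_msa variants above, two adjacent 8-pixel edges are filtered in one pass: each scalar limit is broadcast with __msa_fill_b and the two broadcasts are merged with __msa_ilvr_d, so byte lanes 0..7 carry the first edge's thresholds and lanes 8..15 the second's. A plain-C model of that lane layout (hypothetical helper for illustration, not an MSA intrinsic):

  /* Model of __msa_ilvr_d(fill_b(limit1), fill_b(limit0)): the low 8 lanes
     take limit0, the high 8 lanes take limit1. */
  static void combine_dual_limits(uint8_t out[16], uint8_t limit0,
                                  uint8_t limit1) {
    int i;
    for (i = 0; i < 8; ++i) {
      out[i] = limit0;      /* lanes 0..7: first 8-column edge   */
      out[i + 8] = limit1;  /* lanes 8..15: second 8-column edge */
    }
  }
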
diff --git a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
new file mode 100644
index 0000000..00b6db5
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -0,0 +1,348 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr,
+                              int32_t count) {
+  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+  v16u8 mask, hev, flat, thresh, b_limit, limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+  v16i8 zero = { 0 };
+
+  (void)count;
+
+  /* load vector elements */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {
+    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                q0_filter8);
+    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+    src -= 3 * pitch;
+
+    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+    src += (4 * pitch);
+    SD(q1_d, src);
+    src += pitch;
+    SD(q2_d, src);
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0,
+                                   const uint8_t *limit0,
+                                   const uint8_t *thresh0,
+                                   const uint8_t *b_limit1,
+                                   const uint8_t *limit1,
+                                   const uint8_t *thresh1) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+
+  /* load vector elements */
+  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  tmp = (v16u8)__msa_fill_b(*thresh1);
+  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  tmp = (v16u8)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  tmp = (v16u8)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    src -= 3 * pitch;
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1_out, q2_out, src, pitch);
+    src += (2 * pitch);
+  }
+}
+
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr,
+                            int32_t count) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4;
+
+  (void)count;
+
+  /* load vector elements */
+  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                     p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {
+    /* Store 4 pixels p1 - q1 */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    /* Store 6 pixels p2 - q2 */
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+    src -= 3;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 4, src + 4, pitch);
+  }
+}
+
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1,
+                                 const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *temp_src;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+  temp_src = src - 4;
+
+  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+  temp_src += (8 * pitch);
+  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+  /* transpose 16x8 matrix into 8x16 */
+  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                      q3, q2, q1, q0, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  vec0 = (v8i16)__msa_fill_b(*thresh1);
+  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  vec0 = (v8i16)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  vec0 = (v8i16)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+    src -= 2;
+    ST4x8_UB(vec2, vec3, src, pitch);
+    src += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+
+    /* filter8 */
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+    ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+    src -= 3;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec2, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec2, 4, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec5, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec5, 4, src + 4, pitch);
+  }
+}
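
Throughout these 8-tap routines the flat mask decides, per byte, whether the filter8 result replaces the filter4 result: when __msa_test_bz_v(flat) reports an all-zero mask, only the filter4 output is stored; otherwise __msa_bmnz_v blends the two. Its per-byte effect is (a sketch of the bit-select, not libvpx code):

  /* __msa_bmnz_v(dst, src, mask) keeps dst where mask bits are 0 and takes
     src where they are 1. */
  static uint8_t bmnz_byte(uint8_t dst, uint8_t src, uint8_t mask) {
    return (uint8_t)((dst & ~mask) | (src & mask));
  }

So p2_out = __msa_bmnz_v(p2, p2_filt8_r, flat) yields the filter8 value in flat columns and the original/filter4 value elsewhere.
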
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
similarity index 85%
rename from libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
rename to libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 3df7f4c..99a96d8 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -50,7 +49,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   /* loop filter designed to work using chars so that we can make maximum use
      of 8 bit simd instructions. */
@@ -88,14 +87,14 @@
           : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
       );
 
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
-                                pm1, p0, p3, p4, p5, p6,
-                                thresh_vec, &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                            pm1, p0, p3, p4, p5, p6,
+                            thresh_vec, &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         __asm__ __volatile__ (
             "sw     %[p1],  (%[s1])    \n\t"
@@ -114,7 +113,7 @@
   }
 }
 
-void vp9_lpf_vertical_4_dspr2(unsigned char *s,
+void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -144,7 +143,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -217,14 +216,14 @@
      * mask will be zero and filtering is not needed
      */
     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
-                                p0, p3, p4, p5, p6, thresh_vec,
-                                &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                            p0, p3, p4, p5, p6, thresh_vec,
+                            &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         /* unpack processed 4x4 neighborhood
          * don't use transpose on output data
@@ -307,56 +306,56 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
                                        1);
 }
 
-void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
-  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // #if HAVE_DSPR2
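
The DSPr2 assembly in this file operates on pixels re-biased to the signed range (the trailing "^ N128" undoes the bias) and carries out the standard filter4 arithmetic its comments describe. A scalar sketch of one pixel of that arithmetic (illustrative only; clamp8 is an assumed helper, not a libvpx function):

  static int clamp8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

  /* mask and hev are per-pixel flags, 0 or -1 (all ones). */
  static void filter4_sketch(int8_t *ps1, int8_t *ps0, int8_t *qs0,
                             int8_t *qs1, int mask, int hev) {
    int f, f1, f2;
    f = clamp8(*ps1 - *qs1) & hev;
    f = clamp8(f + 3 * (*qs0 - *ps0)) & mask;
    f1 = clamp8(f + 4) >> 3;
    f2 = clamp8(f + 3) >> 3;
    *qs0 = (int8_t)clamp8(*qs0 - f1);
    *ps0 = (int8_t)clamp8(*ps0 + f2);
    f = ((f1 + 1) >> 1) & ~hev;   /* (filter += 1) >>= 1, filter &= ~hev */
    *ps1 = (int8_t)clamp8(*ps1 + f);
    *qs1 = (int8_t)clamp8(*qs1 - f);
  }
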
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
similarity index 85%
rename from libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
rename to libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
index 008cf8c..4a1506b 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+++ b/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -13,9 +13,10 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,10 +24,10 @@
 
 #if HAVE_DSPR2
 /* inputs & outputs are quad-byte vectors */
-static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
-                                    uint32_t *ps1, uint32_t *ps0,
-                                    uint32_t *qs0, uint32_t *qs1) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
+                                uint32_t *ps1, uint32_t *ps0,
+                                uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -72,34 +73,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -112,13 +113,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -141,23 +142,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -195,12 +196,12 @@
   *qs1 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
-                                     uint32_t ps1, uint32_t ps0,
-                                     uint32_t qs0, uint32_t qs1,
-                                     uint32_t *p1_f0, uint32_t *p0_f0,
-                                     uint32_t *q0_f0, uint32_t *q1_f0) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
+                                 uint32_t ps1, uint32_t ps0,
+                                 uint32_t qs0, uint32_t qs1,
+                                 uint32_t *p1_f0, uint32_t *p0_f0,
+                                 uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -246,34 +247,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -285,13 +286,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -314,23 +315,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -368,10 +369,10 @@
   *q1_f0 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
-                                      uint32_t *op1, uint32_t *op0,
-                                      uint32_t *oq0, uint32_t *oq1,
-                                      uint32_t *oq2, uint32_t *oq3) {
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                  uint32_t *op1, uint32_t *op0,
+                                  uint32_t *oq0, uint32_t *oq1,
+                                  uint32_t *oq2, uint32_t *oq3) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
@@ -445,14 +446,14 @@
   *oq2 = res_oq2;
 }
 
-static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
-                                       uint32_t p1, uint32_t p0,
-                                       uint32_t q0, uint32_t q1,
-                                       uint32_t q2, uint32_t q3,
-                                       uint32_t *op2_f1,
-                                       uint32_t *op1_f1, uint32_t *op0_f1,
-                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
-                                       uint32_t *oq2_f1) {
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                   uint32_t p1, uint32_t p0,
+                                   uint32_t q0, uint32_t q1,
+                                   uint32_t q2, uint32_t q3,
+                                   uint32_t *op2_f1,
+                                   uint32_t *op1_f1, uint32_t *op0_f1,
+                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                   uint32_t *oq2_f1) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   uint32_t  res_op2, res_op1, res_op0;
   uint32_t  res_oq0, res_oq1, res_oq2;
@@ -523,14 +524,14 @@
   *oq2_f1 = res_oq2;
 }
 
-static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
-                                           uint32_t *op5, uint32_t *op4,
-                                           uint32_t *op3, uint32_t *op2,
-                                           uint32_t *op1, uint32_t *op0,
-                                           uint32_t *oq0, uint32_t *oq1,
-                                           uint32_t *oq2, uint32_t *oq3,
-                                           uint32_t *oq4, uint32_t *oq5,
-                                           uint32_t *oq6, uint32_t *oq7) {
+static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                       uint32_t *op5, uint32_t *op4,
+                                       uint32_t *op3, uint32_t *op2,
+                                       uint32_t *op1, uint32_t *op0,
+                                       uint32_t *oq0, uint32_t *oq1,
+                                       uint32_t *oq2, uint32_t *oq3,
+                                       uint32_t *oq4, uint32_t *oq5,
+                                       uint32_t *oq6, uint32_t *oq7) {
   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
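
Note: the renamed filter1_dspr2() above keeps four pixels packed per 32-bit
register, but its inline comments spell out the per-pixel arithmetic. A
plain-C sketch of that arithmetic is shown here for reference; the helper
names (signed_char_clamp, filter4_scalar) are illustrative and not part of
the patch.

  #include <stdint.h>

  /* Illustrative scalar view of the packed DSPr2 code above.  hev and mask
   * are assumed to be 0 or -1 per pixel, and pixel values have already been
   * converted to signed form (value ^ 0x80), as in the callers. */
  static int8_t signed_char_clamp(int t) {
    return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
  }

  static void filter4_scalar(int8_t mask, int8_t hev,
                             int8_t *ps1, int8_t *ps0,
                             int8_t *qs0, int8_t *qs1) {
    int8_t filter, filter1, filter2;

    filter = signed_char_clamp(*ps1 - *qs1);
    filter &= hev;                                   /* vpx_filter &= hev */
    filter = signed_char_clamp(filter + 3 * (*qs0 - *ps0));
    filter &= mask;                                  /* vpx_filter &= mask */

    /* round one side +4 and the other +3, as the comments above describe */
    filter1 = signed_char_clamp(filter + 4) >> 3;
    filter2 = signed_char_clamp(filter + 3) >> 3;

    *qs0 = signed_char_clamp(*qs0 - filter1);
    *ps0 = signed_char_clamp(*ps0 + filter2);

    filter = (int8_t)((filter1 + 1) >> 1);           /* (vpx_filter += 1) >>= 1 */
    filter &= (int8_t)~hev;                          /* vpx_filter &= ~hev */

    *ps1 = signed_char_clamp(*ps1 + filter);
    *qs1 = signed_char_clamp(*qs1 - filter);
  }
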
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
similarity index 99%
rename from libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
rename to libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
index ca01a6a..994ff18 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+++ b/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
similarity index 90%
rename from libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
rename to libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
index 5b0d9cc..e82dfb7 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+++ b/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,13 +24,13 @@
 #if HAVE_DSPR2
 /* processing 4 pixels at the same time
  * compute hev and mask in the same function */
-static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
-                                             uint32_t p1, uint32_t p0,
-                                             uint32_t p3, uint32_t p2,
-                                             uint32_t q0, uint32_t q1,
-                                             uint32_t q2, uint32_t q3,
-                                             uint32_t thresh, uint32_t *hev,
-                                             uint32_t *mask) {
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0,
+                                         uint32_t p3, uint32_t p2,
+                                         uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
   uint32_t  c, r, r3, r_k;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -129,16 +129,16 @@
   *mask = s2;
 }
 
-static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
-                                                       uint32_t flimit,
-                                                       uint32_t thresh,
-                                                       uint32_t p1, uint32_t p0,
-                                                       uint32_t p3, uint32_t p2,
-                                                       uint32_t q0, uint32_t q1,
-                                                       uint32_t q2, uint32_t q3,
-                                                       uint32_t *hev,
-                                                       uint32_t *mask,
-                                                       uint32_t *flat) {
+static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                   uint32_t flimit,
+                                                   uint32_t thresh,
+                                                   uint32_t p1, uint32_t p0,
+                                                   uint32_t p3, uint32_t p2,
+                                                   uint32_t q0, uint32_t q1,
+                                                   uint32_t q2, uint32_t q3,
+                                                   uint32_t *hev,
+                                                   uint32_t *mask,
+                                                   uint32_t *flat) {
   uint32_t  c, r, r3, r_k, r_flat;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -279,12 +279,12 @@
   *flat = flat1;
 }
 
-static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
-                                 uint32_t p2, uint32_t p1,
-                                 uint32_t p0, uint32_t q0,
-                                 uint32_t q1, uint32_t q2,
-                                 uint32_t q3, uint32_t q4,
-                                 uint32_t *flat2) {
+static INLINE void flatmask5(uint32_t p4, uint32_t p3,
+                             uint32_t p2, uint32_t p1,
+                             uint32_t p0, uint32_t q0,
+                             uint32_t q1, uint32_t q2,
+                             uint32_t q3, uint32_t q4,
+                             uint32_t *flat2) {
   uint32_t  c, r, r_k, r_flat;
   uint32_t  ones = 0xFFFFFFFF;
   uint32_t  flat_thresh = 0x01010101;
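
Note: flatmask5() above runs the wide flatness test on packed words, with
flat_thresh = 0x01010101 acting as a per-byte threshold of 1. A per-pixel
sketch of that test follows; the helper name and scalar layout are
assumptions, and the callers combine the result with the narrower masks
(mask & flat & flat2).

  #include <stdint.h>
  #include <stdlib.h>  /* abs() */

  static int flat5_scalar(uint8_t p4, uint8_t p3, uint8_t p2, uint8_t p1,
                          uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2,
                          uint8_t q3, uint8_t q4) {
    int d = abs(p4 - p0);
    if (abs(p3 - p0) > d) d = abs(p3 - p0);
    if (abs(p2 - p0) > d) d = abs(p2 - p0);
    if (abs(p1 - p0) > d) d = abs(p1 - p0);
    if (abs(q1 - q0) > d) d = abs(q1 - q0);
    if (abs(q2 - q0) > d) d = abs(q2 - q0);
    if (abs(q3 - q0) > d) d = abs(q3 - q0);
    if (abs(q4 - q0) > d) d = abs(q4 - q0);
    return d <= 1;  /* flat when every tap is within 1 of the edge pixel */
  }
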
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
similarity index 90%
rename from libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
rename to libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
index 7cd0b63..4138f56 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_8_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -53,7 +52,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < 2; i++) {
     sp3 = s - (pitch << 2);
@@ -81,13 +80,13 @@
           [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])    \n\t"
@@ -104,13 +103,13 @@
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -130,18 +129,18 @@
       );
     } else if ((flat != 0) && (mask != 0)) {
       /* filtering */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -319,7 +318,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dspr2(unsigned char *s,
+void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -351,7 +350,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -451,39 +450,39 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat != 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
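
Note: vpx_lpf_horizontal_8_dspr2() and vpx_lpf_vertical_8_dspr2() above pick
between the 4-tap output (filter1_dspr2) and the 7-tap flat output
(mbfilter_dspr2), and the byte-wise stores guarded by mask & flat make that
choice per pixel. An illustrative per-pixel view, with assumed names:

  #include <stdint.h>

  static uint8_t lpf8_select(uint8_t mask_px, uint8_t flat_px,
                             uint8_t f0_px, uint8_t f1_px, uint8_t orig_px) {
    if (!mask_px)
      return orig_px;   /* edge left untouched */
    if (flat_px)
      return f1_px;     /* flat region: 7-tap mbfilter output */
    return f0_px;       /* otherwise: 4-tap filter output */
  }
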
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
similarity index 90%
rename from libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
rename to libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 6c94674..8a48650 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_16_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
                                  int pitch,
                                  const uint8_t *blimit,
                                  const uint8_t *limit,
@@ -58,7 +57,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < (2 * count); i++) {
     sp7 = s - (pitch << 3);
@@ -110,17 +109,17 @@
           [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])            \n\t"
@@ -139,17 +138,17 @@
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
       COMBINE_LEFT_RIGHT_3TO6()
@@ -189,13 +188,13 @@
       /* f1 */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -215,18 +214,18 @@
       );
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -399,36 +398,36 @@
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 + f2 */
       /* f0  function */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* f1  function */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       /* f2  function */
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (
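
Note: the 16-wide filter above adds a third tier: flat2 from flatmask5()
selects the wide_mbfilter_dspr2() output, which is why its stores are
guarded by mask & flat & flat2. Extending the per-pixel sketch, with assumed
names:

  #include <stdint.h>

  static uint8_t lpf16_select(uint8_t mask_px, uint8_t flat_px, uint8_t flat2_px,
                              uint8_t f0_px, uint8_t f1_px, uint8_t f2_px,
                              uint8_t orig_px) {
    if (!mask_px)            return orig_px;  /* no filtering */
    if (flat_px && flat2_px) return f2_px;    /* wide (15-tap) filter output */
    if (flat_px)             return f1_px;    /* 7-tap flat filter output */
    return f0_px;                             /* 4-tap filter output */
  }
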
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
similarity index 90%
rename from libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
rename to libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
index 851fc6c..e580f01 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+++ b/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_vertical_16_dspr2(uint8_t *s,
+void vpx_lpf_vertical_16_dspr2(uint8_t *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
@@ -55,7 +54,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -248,61 +247,61 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
                (mask == 0xFFFFFFFF)) {
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       STORE_F2()
     } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
       /* f1 */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -466,32 +465,32 @@
       }
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1+f2 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (
diff --git a/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libvpx/vpx_dsp/mips/loopfilter_msa.h
new file mode 100644
index 0000000..62b1706
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/loopfilter_msa.h
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_LOOPFILTER_MSA_H_
+#define VPX_DSP_LOOPFILTER_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out) {             \
+  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
+  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
+  v8i16 q0_sub_p0_r, filt_r, cnst3h;                                     \
+                                                                         \
+  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
+  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
+  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
+  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
+                                                                         \
+  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
+  filt = filt & (v16i8)hev_in;                                           \
+  q0_sub_p0 = q0_m - p0_m;                                               \
+  filt_sign = __msa_clti_s_b(filt, 0);                                   \
+                                                                         \
+  cnst3h = __msa_ldi_h(3);                                               \
+  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
+  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
+  filt_r += q0_sub_p0_r;                                                 \
+  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
+                                                                         \
+  /* combine left and right part */                                      \
+  filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r);                    \
+                                                                         \
+  filt = filt & (v16i8)mask_in;                                          \
+  cnst4b = __msa_ldi_b(4);                                               \
+  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
+  filt1 >>= 3;                                                           \
+                                                                         \
+  cnst3b = __msa_ldi_b(3);                                               \
+  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
+  filt2 >>= 3;                                                           \
+                                                                         \
+  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
+  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
+  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
+  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
+                                                                         \
+  filt = __msa_srari_b(filt1, 1);                                        \
+  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
+  filt = filt & (v16i8)hev_in;                                           \
+                                                                         \
+  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
+  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
+  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
+  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
+}
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out) {             \
+  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
+  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
+  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;                \
+                                                                         \
+  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
+  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
+  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
+  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
+                                                                         \
+  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
+                                                                         \
+  filt = filt & (v16i8)hev_in;                                           \
+                                                                         \
+  q0_sub_p0 = q0_m - p0_m;                                               \
+  filt_sign = __msa_clti_s_b(filt, 0);                                   \
+                                                                         \
+  cnst3h = __msa_ldi_h(3);                                               \
+  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
+  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
+  filt_r += q0_sub_p0_r;                                                 \
+  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
+                                                                         \
+  q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);       \
+  filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                         \
+  filt_l += q0_sub_p0_l;                                                 \
+  filt_l = __msa_sat_s_h(filt_l, 7);                                     \
+                                                                         \
+  filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                    \
+  filt = filt & (v16i8)mask_in;                                          \
+                                                                         \
+  cnst4b = __msa_ldi_b(4);                                               \
+  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
+  filt1 >>= 3;                                                           \
+                                                                         \
+  cnst3b = __msa_ldi_b(3);                                               \
+  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
+  filt2 >>= 3;                                                           \
+                                                                         \
+  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
+  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
+  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
+  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
+                                                                         \
+  filt = __msa_srari_b(filt1, 1);                                        \
+  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
+  filt = filt & (v16i8)hev_in;                                           \
+                                                                         \
+  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
+  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
+  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
+  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
+}
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) {  \
+  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;         \
+  v16u8 zero_in = { 0 };                                                 \
+                                                                         \
+  tmp = __msa_ori_b(zero_in, 1);                                         \
+  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                            \
+  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                            \
+  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                            \
+  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                            \
+                                                                         \
+  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);                 \
+  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                       \
+  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);                 \
+  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                       \
+                                                                         \
+  flat_out = (tmp < (v16u8)flat_out);                                    \
+  flat_out = __msa_xori_b(flat_out, 0xff);                               \
+  flat_out = flat_out & (mask);                                          \
+}
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
+                  q5_in, q6_in, q7_in, flat_in, flat2_out) {        \
+  v16u8 tmp, zero_in = { 0 };                                       \
+  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;         \
+  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;         \
+                                                                    \
+  tmp = __msa_ori_b(zero_in, 1);                                    \
+  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                       \
+  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                       \
+  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                       \
+  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                       \
+  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                       \
+  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                       \
+  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                       \
+  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                       \
+                                                                    \
+  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);              \
+  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                \
+  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                \
+  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                \
+                                                                    \
+  flat2_out = (tmp < (v16u8)flat2_out);                             \
+  flat2_out = __msa_xori_b(flat2_out, 0xff);                        \
+  flat2_out = flat2_out & flat_in;                                  \
+}
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                  \
+                    q0_in, q1_in, q2_in, q3_in,                  \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out,    \
+                    q0_filt8_out, q1_filt8_out, q2_filt8_out) {  \
+  v8u16 tmp0, tmp1, tmp2;                                        \
+                                                                 \
+  tmp2 = p2_in + p1_in + p0_in;                                  \
+  tmp0 = p3_in << 1;                                             \
+                                                                 \
+  tmp0 = tmp0 + tmp2 + q0_in;                                    \
+  tmp1 = tmp0 + p3_in + p2_in;                                   \
+  p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+                                                                 \
+  tmp1 = tmp0 + p1_in + q1_in;                                   \
+  p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+                                                                 \
+  tmp1 = q2_in + q1_in + q0_in;                                  \
+  tmp2 = tmp2 + tmp1;                                            \
+  tmp0 = tmp2 + (p0_in);                                         \
+  tmp0 = tmp0 + (p3_in);                                         \
+  p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3);           \
+                                                                 \
+  tmp0 = q2_in + q3_in;                                          \
+  tmp0 = p0_in + tmp1 + tmp0;                                    \
+  tmp1 = q3_in + q3_in;                                          \
+  tmp1 = tmp1 + tmp0;                                            \
+  q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+                                                                 \
+  tmp0 = tmp2 + q3_in;                                           \
+  tmp1 = tmp0 + q0_in;                                           \
+  q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+                                                                 \
+  tmp1 = tmp0 - p2_in;                                           \
+  tmp0 = q1_in + q3_in;                                          \
+  tmp1 = tmp0 + tmp1;                                            \
+  q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
+                     q0_in, q1_in, q2_in, q3_in,                 \
+                     limit_in, b_limit_in, thresh_in,            \
+                     hev_out, mask_out, flat_out) {              \
+  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                 \
+  /* absolute subtraction of pixel values */                     \
+  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+                                                                 \
+  /* calculation of hev */                                       \
+  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+  hev_out = thresh_in < (v16u8)flat_out;                         \
+                                                                 \
+  /* calculation of mask */                                      \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+  p1_asub_q1_m >>= 1;                                            \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+                                                                 \
+  mask_out = b_limit_in < p0_asub_q0_m;                          \
+  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+                                                                 \
+  mask_out = limit_in < (v16u8)mask_out;                         \
+  mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */
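
Note: LPF_MASK_HEV and VP9_FLAT4 above evaluate the same conditions for 16
pixels at once, producing 0x00/0xff per byte. A scalar sketch of those
conditions follows; the helper names are illustrative, and the vector code
saturates the doubled |p0 - q0| term, which only matters at the extreme.

  #include <stdint.h>
  #include <stdlib.h>  /* abs() */

  static int hev_scalar(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                        uint8_t thresh) {
    int m = abs(p1 - p0);
    if (abs(q1 - q0) > m) m = abs(q1 - q0);
    return m > thresh;                 /* high edge variance */
  }

  static int mask_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                         uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                         uint8_t limit, uint8_t b_limit) {
    int inner = abs(p3 - p2);
    if (abs(p2 - p1) > inner) inner = abs(p2 - p1);
    if (abs(p1 - p0) > inner) inner = abs(p1 - p0);
    if (abs(q1 - q0) > inner) inner = abs(q1 - q0);
    if (abs(q2 - q1) > inner) inner = abs(q2 - q1);
    if (abs(q3 - q2) > inner) inner = abs(q3 - q2);
    return inner <= limit &&
           2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= b_limit;
  }

  static int flat4_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                          uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
    int d = abs(p1 - p0);
    if (abs(q1 - q0) > d) d = abs(q1 - q0);
    if (abs(p2 - p0) > d) d = abs(p2 - p0);
    if (abs(q2 - q0) > d) d = abs(q2 - q0);
    if (abs(p3 - p0) > d) d = abs(p3 - p0);
    if (abs(q3 - q0) > d) d = abs(q3 - q0);
    return d <= 1;                     /* VP9_FLAT4 also ANDs this with mask */
  }
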
diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h
new file mode 100644
index 0000000..91e3615
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/macros_msa.h
@@ -0,0 +1,1932 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
+#define VPX_DSP_MIPS_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "lh  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#define LW(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint32_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "lw  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m);                                     \
+  val1_m = LW(psrc_m + 4);                                 \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint16_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SW(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint32_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SD(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint64_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+#else  // !(__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ulh  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#define LW(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint32_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ulw  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m1);                                    \
+  val1_m = LW(psrc_m1 + 4);                                \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint16_t val_m = (val);          \
+                                         \
+  __asm__ __volatile__ (                 \
+      "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SW(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint32_t val_m = (val);          \
+                                         \
+  __asm__ __volatile__ (                 \
+      "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SD(val, pdst) {                                     \
+  uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+  uint32_t val0_m, val1_m;                                  \
+                                                            \
+  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                            \
+  SW(val0_m, pdst_m1);                                      \
+  SW(val1_m, pdst_m1 + 4);                                  \
+}
+#endif  // (__mips_isa_rev >= 6)
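+
+/* Illustrative usage sketch (not part of the upstream sources; buffer names
+   are hypothetical): the scalar helpers above make possibly unaligned
+   accesses safe on pre-R6 cores (ulw/uld/ush/usw) while mapping to plain
+   loads/stores on R6.
+
+     const uint8_t *src = ...;   // possibly unaligned source
+     uint8_t *dst = ...;         // possibly unaligned destination
+     uint64_t v = LD(src);       // load 8 bytes
+     SD(v, dst);                 // store them back
+*/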
+
+/* Description : Load 4 words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1, out2, out3
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3) {  \
+  out0 = LW((psrc));                                 \
+  out1 = LW((psrc) + stride);                        \
+  out2 = LW((psrc) + 2 * stride);                    \
+  out3 = LW((psrc) + 3 * stride);                    \
+}
+
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + stride);            \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * stride, stride, out2, out3);      \
+}
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) {  \
+  SW(in0, (pdst));                               \
+  SW(in1, (pdst) + stride);                      \
+  SW(in2, (pdst) + 2 * stride);                  \
+  SW(in3, (pdst) + 3 * stride);                  \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) {  \
+  SD(in0, (pdst));                               \
+  SD(in1, (pdst) + stride);                      \
+  SD(in2, (pdst) + 2 * stride);                  \
+  SD(in3, (pdst) + 3 * stride);                  \
+}
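+
+/* Illustrative sketch (hypothetical pointers and strides): LW4 together with
+   SW4 copies a 4-row block of 32-bit words, one word per row.
+
+     uint32_t w0, w1, w2, w3;
+     LW4(src, src_stride, w0, w1, w2, w3);
+     SW4(w0, w1, w2, w3, dst, dst_stride);
+*/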
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
+  out0 = LD_B(RTYPE, (psrc));                     \
+  out1 = LD_B(RTYPE, (psrc) + stride);            \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
+  LD_B2(RTYPE, (psrc), stride, out0, out1);             \
+  out2 = LD_B(RTYPE, (psrc) + 2 * stride);              \
+}
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
+  LD_B2(RTYPE, (psrc), stride, out0, out1);                   \
+  LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);      \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
+  out4 = LD_B(RTYPE, (psrc) + 4 * stride);                          \
+}
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B7(RTYPE, psrc, stride,                             \
+              out0, out1, out2, out3, out4, out5, out6) {      \
+  LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
+  LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
+}
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride,                                    \
+              out0, out1, out2, out3, out4, out5, out6, out7) {       \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+  LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
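+
+/* Illustrative sketch (hypothetical 'src'/'stride'): LD_UB8 loads eight
+   consecutive 16-byte rows, e.g. a 16x8 block of pixels, into MSA registers.
+
+     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+     LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
+*/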
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
+  out0 = LD_H(RTYPE, (psrc));                     \
+  out1 = LD_H(RTYPE, (psrc) + (stride));          \
+}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
+  LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
+  LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);      \
+}
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+#define LD_H8(RTYPE, psrc, stride,                                    \
+              out0, out1, out2, out3, out4, out5, out6, out7) {       \
+  LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+  LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride,                                     \
+               out0, out1, out2, out3, out4, out5, out6, out7,          \
+               out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  LD_H8(RTYPE, (psrc), stride,                                          \
+        out0, out1, out2, out3, out4, out5, out6, out7);                \
+  LD_H8(RTYPE, (psrc) + 8 * stride, stride,                             \
+        out8, out9, out10, out11, out12, out13, out14, out15);          \
+}
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+                 data into 4 vectors (Each vector with 4 signed halfwords)
+   Arguments   : Input   - psrc
+                 Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
+  out0 = LD_SH(psrc);                                    \
+  out2 = LD_SH(psrc + 8);                                \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
+}
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) {  \
+  out0 = LD_SW((psrc));                     \
+  out1 = LD_SW((psrc) + stride);            \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_B(RTYPE, in0, (pdst));                     \
+  ST_B(RTYPE, in1, (pdst) + stride);            \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
+  ST_B2(RTYPE, in0, in1, (pdst), stride);                 \
+  ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
+              pdst, stride) {                                     \
+  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
+  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_H(RTYPE, in0, (pdst));                     \
+  ST_H(RTYPE, in1, (pdst) + stride);            \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
+  ST_H2(RTYPE, in0, in1, (pdst), stride);                 \
+  ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
+}
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
+  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                           \
+  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);              \
+}
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) {  \
+  ST_SW(in0, (pdst));                     \
+  ST_SW(in1, (pdst) + stride);            \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) {         \
+  uint16_t out0_m, out1_m, out2_m, out3_m;          \
+  uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
+                                                    \
+  out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
+  out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
+  out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
+  out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
+                                                    \
+  SH(out0_m, pblk_2x4_m);                           \
+  SH(out1_m, pblk_2x4_m + stride);                  \
+  SH(out2_m, pblk_2x4_m + 2 * stride);              \
+  SH(out3_m, pblk_2x4_m + 3 * stride);              \
+}
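+
+/* Illustrative sketch (hypothetical 'dst'/'stride'): ST2x4_UB writes a
+   2-byte-wide column over four rows, e.g. part of a 2xN block, taking
+   halfword lanes 0..3 of 'res'.
+
+     ST2x4_UB(res, 0, dst, stride);
+*/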
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst)
+                 Index 1 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) {        \
+  uint32_t out0_m, out1_m;                  \
+  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_w((v4i32)in, 0);    \
+  out1_m = __msa_copy_u_w((v4i32)in, 1);    \
+                                            \
+  SW(out0_m, pblk_4x2_m);                   \
+  SW(out1_m, pblk_4x2_m + stride);          \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
+   Details     : 'Idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'Idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'Idx2' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'Idx3' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
+  uint32_t out0_m, out1_m, out2_m, out3_m;                          \
+  uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
+                                                                    \
+  out0_m = __msa_copy_u_w((v4i32)in0, idx0);                        \
+  out1_m = __msa_copy_u_w((v4i32)in0, idx1);                        \
+  out2_m = __msa_copy_u_w((v4i32)in1, idx2);                        \
+  out3_m = __msa_copy_u_w((v4i32)in1, idx3);                        \
+                                                                    \
+  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
+}
+#define ST4x8_UB(in0, in1, pdst, stride) {                        \
+  uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
+                                                                  \
+  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
+  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
+}
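+
+/* Illustrative sketch (hypothetical names): ST4x4_UB stores word lanes idx0
+   and idx1 of 'out0' followed by lanes idx2 and idx3 of 'out1' as four 4-byte
+   rows; ST4x8_UB extends that to eight rows from two vectors.
+
+     ST4x4_UB(out0, out1, 0, 1, 0, 1, dst, stride);
+*/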
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) {              \
+  uint64_t out0_m;                        \
+                                          \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);  \
+  SD(out0_m, pdst);                       \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) {        \
+  uint64_t out0_m, out1_m;                  \
+  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);    \
+  out1_m = __msa_copy_u_d((v2i64)in, 1);    \
+                                            \
+  SD(out0_m, pblk_8x2_m);                   \
+  SD(out1_m, pblk_8x2_m + stride);          \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) {                  \
+  uint64_t out0_m, out1_m, out2_m, out3_m;                  \
+  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
+                                                            \
+  out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
+  out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
+  out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
+  out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
+                                                            \
+  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
+}
+
+/* Description : Average with rounding (in0 + in1 + 1) / 2
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from 'in0' vector is added with
+                 each unsigned byte element from 'in1' vector. Then the average
+                 with rounding is calculated and written to 'out0'
+*/
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);    \
+  out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);    \
+}
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \
+  v16i8 zero_m = { 0 };                                              \
+  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val);  \
+  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val);  \
+}
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
+                  out0, out1, out2, out3, slide_val) {  \
+  SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);    \
+  SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);    \
+}
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \
+  out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);         \
+  out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);         \
+}
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,      \
+                out0, out1, out2, slide_val) {                        \
+  SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);  \
+  out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);  \
+}
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
+  out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \
+  out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);     \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,     \
+                out0, out1, out2, out3) {                        \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
+}
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);        \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
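+
+/* Worked example (illustrative): each output halfword is
+   out[i] = mult[2*i] * cnst[2*i] + mult[2*i + 1] * cnst[2*i + 1].
+   With mult0 bytes {1, 2, 3, 4, ...} and cnst0 bytes {10, 20, 30, 40, ...},
+   the first two halfwords of out0 are 1*10 + 2*20 = 50 and 3*30 + 4*40 = 250.
+*/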
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);        \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);        \
+}
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);        \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
+  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
+  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
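+
+/* Illustrative sketch (hypothetical data/filter vectors): DPADD accumulates
+   on top of 'out', so a typical pattern seeds the accumulator first, e.g.
+   with a rounding offset, and then adds filter-tap products.
+
+     v8i16 acc0 = __msa_ldi_h(32);   // example rounding offset
+     v8i16 acc1 = __msa_ldi_h(32);
+     DPADD_SB2_SH(src_l, src_r, filt_l, filt_r, acc0, acc1);
+*/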
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
+  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
+  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of input
+                 i.e. signed double word
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                       \
+  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0);  \
+  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1);  \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Minimum of unsigned halfword elements from two vectors is
+                 copied to the output vector
+   Arguments   : Inputs  - in0, in1, min_vec
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Minimum of the unsigned halfword element values from 'in0' and
+                 'min_vec' is written in place to 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) {         \
+  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
+  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
+  MIN_UH2(RTYPE, in0, in1, min_vec);                   \
+  MIN_UH2(RTYPE, in2, in3, min_vec);                   \
+}
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) ({                          \
+  v8i16 max_m = __msa_ldi_h(255);                     \
+  v8i16 out_m;                                        \
+                                                      \
+  out_m = __msa_maxi_s_h((v8i16)in, 0);               \
+  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+  out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1) {  \
+  in0 = CLIP_SH_0_255(in0);         \
+  in1 = CLIP_SH_0_255(in1);         \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
+  CLIP_SH2_0_255(in0, in1);                   \
+  CLIP_SH2_0_255(in2, in3);                   \
+}
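+
+/* Worked example (illustrative): CLIP_SH_0_255 applied to the halfwords
+   {-5, 300, 128, 255, ...} yields {0, 255, 128, 255, ...}, i.e. values are
+   clamped to the unsigned 8-bit pixel range before packing back to bytes.
+*/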
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+   Arguments   : Input  - in       (signed word vector)
+                 Output - sum_m    (i32 sum)
+                 Return Type - signed word (GP)
+   Details     : 4 signed word elements of 'in' vector are added together and
+                 the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) ({                        \
+  v2i64 res0_m, res1_m;                           \
+  int32_t sum_m;                                  \
+                                                  \
+  res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
+  res1_m = __msa_splati_d(res0_m, 1);             \
+  res0_m = res0_m + res1_m;                       \
+  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);       \
+  sum_m;                                          \
+})
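+
+/* Worked example (illustrative): for in = {1, 2, 3, 4} the pairwise additions
+   give {1 + 2, 3 + 4} = {3, 7}; adding the splatted upper half gives 10, which
+   is the value returned (assuming no overflow of the int32_t result).
+*/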
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+   Arguments   : Inputs  - in       (unsigned halfword vector)
+                 Outputs - sum_m    (u32 sum)
+                 Return Type - unsigned word
+   Details     : 8 unsigned halfword elements of input vector are added
+                 together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) ({                           \
+  v4u32 res_m;                                       \
+  v2u64 res0_m, res1_m;                              \
+  uint32_t sum_m;                                    \
+                                                     \
+  res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);      \
+  res0_m = __msa_hadd_u_d(res_m, res_m);             \
+  res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
+  res0_m = res0_m + res1_m;                          \
+  sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
+  sum_m;                                             \
+})
+
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
+  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is subtracted from
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : SAD (Sum of Absolute Difference)
+   Arguments   : Inputs  - in0, in1, ref0, ref1
+                 Outputs - sad_m                 (halfword vector)
+                 Return Type - unsigned halfword
+   Details     : Absolute difference of all the byte elements from 'in0' with
+                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
+                 pairs are added together to generate 8 halfword results.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
+  v16u8 diff0_m, diff1_m;                                   \
+  v8u16 sad_m = { 0 };                                      \
+                                                            \
+  diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);        \
+  diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);        \
+                                                            \
+  sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m);  \
+  sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m);  \
+                                                            \
+  sad_m;                                                    \
+})
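+
+/* Illustrative sketch (hypothetical 'src'/'ref' rows): SAD_UB2_UH reduces two
+   16-byte rows to eight partial halfword sums; HADD_UH_U32 then folds those
+   into a single scalar SAD value.
+
+     v8u16 sad = SAD_UB2_UH(src0, src1, ref0, ref1);
+     uint32_t sad32 = HADD_UH_U32(sad);
+*/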
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed odd halfword element from 'in0' is subtracted from
+                 even signed halfword element from 'in0' (pairwise) and the
+                 word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
+  out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set element n of input vector to GPR value
+   Arguments   : Inputs - in0, in1, in2, in3
+                 Output - out
+                 Return Type - as per RTYPE
+   Details     : Set element 0 in vector 'out' to value specified in 'in0'
+*/
+#define INSERT_W2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
+}
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);   \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);   \
+}
+#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
+#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+
+#define INSERT_D2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
+  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);     \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
+  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);     \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);     \
+  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);     \
+}
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
+  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);     \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);     \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);     \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);     \
+}
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                in8, in9, in10, in11, in12, in13, in14, in15,      \
+                out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
+          out0, out1, out2, out3);                                 \
+  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
+          out4, out5, out6, out7);                                 \
+}
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
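+
+/* Illustrative sketch (hypothetical 'src0'/'src1'): interleaving with a zero
+   vector is a common way to widen unsigned bytes to halfwords, since every
+   source byte ends up paired with a zero byte.
+
+     v16i8 zero = { 0 };
+     v8i16 h0, h1;
+     ILVR_B2_SH(zero, src0, zero, src1, h0, h1);
+*/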
+
+/* Description : Interleave right half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);     \
+  out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);     \
+}
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) {  \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                         \
+  out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                 \
+}
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {        \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {        \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+}
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
+  in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
+  in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
+}
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_UH2(RTYPE, in2, in3, sat_val);                   \
+}
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 signed value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
+  in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
+  in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_SH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_SH2(RTYPE, in2, in3, sat_val);                   \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, idx0, idx1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'idx0' element value from 'in' vector is replicated to all
+                  elements in 'out0' vector
+                  Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
+  out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);        \
+  out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);        \
+}
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
+                  out0, out1, out2, out3) {           \
+  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
+  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
+}
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);     \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
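+
+/* Illustrative sketch (hypothetical halfword vectors 'res_hi'/'res_lo'): after
+   halfword arithmetic, PCKEV_B gathers the even (low-order) bytes of two
+   halfword vectors into one byte vector, a common way of narrowing 16-bit
+   results back to 8-bit pixels.
+
+     v16u8 pix;
+     pix = (v16u8)__msa_pckev_b((v16i8)res_hi, (v16i8)res_lo);
+*/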
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);     \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' are copied to the left half
+                 of 'out0' & even double word elements of 'in1' are copied to
+                 the right half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);     \
+  out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);     \
+}
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+   Arguments   : Inputs  - in0, in1
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from input vector 'in0' is
+                 logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) {         \
+  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
+  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) {    \
+  XORI_B2_128(RTYPE, in0, in1);                \
+  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
+  XORI_B2_128(RTYPE, in0, in1);                   \
+  XORI_B2_128(RTYPE, in2, in3);                   \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
+  XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
+  XORI_B3_128(RTYPE, in4, in5, in6);                             \
+}
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
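+/* Editor's note: illustrative sketch, not part of the upstream file.
+   XOR with 128 flips the sign bit of every byte, mapping unsigned pixels
+   [0, 255] onto signed values [-128, 127] (0 -> -128, 255 -> 127) and back,
+   so pixel data can be fed to signed byte arithmetic:
+
+     v16i8 src0, src1;            // pixel rows reinterpreted as signed bytes
+     XORI_B2_128_SB(src0, src1);  // now centred on zero for signed dot products
+*/
+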
+/* Description : Average of signed halfword elements -> (a + b) / 2
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is added to the
+                 corresponding signed halfword element of 'in1' with full
+                 precision, resulting in one extra bit in the result. The
+                 result is then divided by 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);          \
+  out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);          \
+  out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);          \
+  out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);          \
+}
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'in0' are added to signed
+                 halfword elements of 'in1'. The result is then saturated to
+                 the signed halfword data type range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);    \
+  out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3);    \
+}
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector
+   Details     : Each element of vector 'in0' is left shifted by 'shift' and
+                 the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) {  \
+  in0 = in0 << shift;                         \
+  in1 = in1 << shift;                         \
+  in2 = in2 << shift;                         \
+  in3 = in3 << shift;                         \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) {  \
+  in0 = in0 >> shift;                        \
+  in1 = in1 >> shift;                        \
+  in2 = in2 >> shift;                        \
+  in3 = in3 >> shift;                        \
+}
+
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) {               \
+  in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
+  in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRAR_W2(RTYPE, in0, in1, shift);                   \
+  SRAR_W2(RTYPE, in2, in3, shift);                   \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) {        \
+  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
+  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_H2(RTYPE, in0, in1, shift);                   \
+  SRARI_H2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) {        \
+  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
+  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_W2(RTYPE, in0, in1, shift);                   \
+  SRARI_W2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
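+/* Editor's note: worked example, not part of the upstream file.
+   A "shift right arithmetic rounded" is equivalent to adding half of the
+   divisor before the shift, i.e. (x + (1 << (shift - 1))) >> shift.
+   With shift = 6:  100 -> (100 + 32) >> 6 = 2  and  31 -> (31 + 32) >> 6 = 0,
+   where a plain arithmetic shift would give 1 and 0 respectively. */
+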
+/* Description : Logical shift right all elements of vector (immediate)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is logically right shifted by
+                 'shift' and the result is written to 'out0'. 'shift' is an
+                 immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
+  out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                             \
+  out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                             \
+  out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                             \
+  out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                             \
+}
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element from 'in0' is multiplied with the corresponding
+                 element of 'in1' and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 * in1;                             \
+  out1 = in2 * in3;                             \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  MUL2(in0, in1, in2, in3, out0, out1);               \
+  MUL2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in0' is added to the corresponding element
+                 of 'in1' and the result is written to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 + in1;                             \
+  out1 = in2 + in3;                             \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  ADD2(in0, in1, in2, in3, out0, out1);               \
+  ADD2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in1' is subtracted from the corresponding
+                 element of 'in0' and the result is written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 - in1;                             \
+  out1 = in2 - in3;                             \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  out0 = in0 - in1;                                   \
+  out1 = in2 - in3;                                   \
+  out2 = in4 - in5;                                   \
+  out3 = in6 - in7;                                   \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 4 sign-extended word elements
+*/
+#define UNPCK_R_SH_SW(in, out) {                 \
+  v8i16 sign_m;                                  \
+                                                 \
+  sign_m = __msa_clti_s_h((v8i16)in, 0);         \
+  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (unsigned halfword vectors)
+                 Return Type - signed halfword
+   Details     : Zero extended right half of vector is returned in 'out0'
+                 Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) {   \
+  v16i8 zero_m = { 0 };                 \
+                                        \
+  ILVRL_B2_SH(zero_m, in, out0, out1);  \
+}
+
+/* Description : Sign extend halfword elements from input vector and return
+                 the result in pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with the same vector 'in' to
+                 generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) {    \
+  v8i16 tmp_m;                           \
+                                         \
+  tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
+  ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  out0 = in0 + in3;                                                \
+  out1 = in1 + in2;                                                \
+                                                                   \
+  out2 = in1 - in2;                                                \
+  out3 = in0 - in3;                                                \
+}
+
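+/* Editor's note: worked example, not part of the upstream file.
+   The butterfly pairs mirrored elements into sums and differences, the basic
+   building block of the DCT-style transforms that use these macros.  With
+   scalar inputs in0..in3 = 1, 2, 3, 4:
+     out0 = 1 + 4 = 5,  out1 = 2 + 3 = 5,  out2 = 2 - 3 = -1,  out3 = 1 - 4 = -3. */
+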
+/* Description : Butterfly of 8 input vectors
+   Arguments   : Inputs  - in0 ...  in7
+                 Outputs - out0 .. out7
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  out0 = in0 + in7;                                                    \
+  out1 = in1 + in6;                                                    \
+  out2 = in2 + in5;                                                    \
+  out3 = in3 + in4;                                                    \
+                                                                       \
+  out4 = in3 - in4;                                                    \
+  out5 = in2 - in5;                                                    \
+  out6 = in1 - in6;                                                    \
+  out7 = in0 - in7;                                                    \
+}
+
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 .. out15
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
+                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
+                     out0, out1, out2, out3, out4, out5, out6, out7,          \
+                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  out0 = in0 + in15;                                                          \
+  out1 = in1 + in14;                                                          \
+  out2 = in2 + in13;                                                          \
+  out3 = in3 + in12;                                                          \
+  out4 = in4 + in11;                                                          \
+  out5 = in5 + in10;                                                          \
+  out6 = in6 + in9;                                                           \
+  out7 = in7 + in8;                                                           \
+                                                                              \
+  out8 = in7 - in8;                                                           \
+  out9 = in6 - in9;                                                           \
+  out10 = in5 - in10;                                                         \
+  out11 = in4 - in11;                                                         \
+  out12 = in3 - in12;                                                         \
+  out13 = in2 - in13;                                                         \
+  out14 = in1 - in14;                                                         \
+  out15 = in0 - in15;                                                         \
+}
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                           \
+  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
+             tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
+  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
+  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
+  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
+  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
+  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
+  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                            in8, in9, in10, in11, in12, in13, in14, in15,      \
+                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
+  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
+                                                                               \
+  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
+  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
+  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
+  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
+                                                                               \
+  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
+  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
+  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
+  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
+  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
+  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
+  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
+  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
+                                                                               \
+  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
+  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
+  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
+  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
+  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 s0_m, s1_m;                                                       \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                             \
+  ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                    \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
+}
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
+  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
+  v8i16 zero_m = { 0 };                                                       \
+                                                                              \
+  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
+             tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
+  ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
+  ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
+                                                                              \
+  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+                                                                              \
+  out4 = zero_m;                                                              \
+  out5 = zero_m;                                                              \
+  out6 = zero_m;                                                              \
+  out7 = zero_m;                                                              \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
+  ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
+  ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
+  ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
+}
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m;                                                       \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
+                                                                          \
+  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
+  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
+  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
+  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
+  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
+           tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
+  out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
+  out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
+  out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
+  out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
+}
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
+                                                                          \
+  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
+  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
+                                                                          \
+  out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
+  out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
+}
+
+/* Description : Add block 4x4
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the corresponding destination bytes, clipped to the 0-255
+                 range and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
+  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
+  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
+  v16i8 dst0_m = { 0 };                                         \
+  v16i8 dst1_m = { 0 };                                         \
+  v16i8 zero_m = { 0 };                                         \
+                                                                \
+  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
+  LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
+  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
+  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
+  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
+  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
+  CLIP_SH2_0_255(res0_m, res1_m);                               \
+  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
+  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
+}
+
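+/* Editor's note: scalar equivalent, not part of the upstream file.
+   Per pixel this is the usual reconstruction step for a 4x4 block,
+   dst[i] = clip_0_255(dst[i] + in[i]), where the halfword inputs typically
+   hold inverse-transform output and pdst points at the predicted block. */
+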
+/* Description : Pack even elements of input vectors & xor with 128
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
+                 Return Type - unsigned byte
+   Details     : Signed byte even elements from 'in0' and 'in1' are packed
+                 together in one vector and the resulting vector is xor'ed with
+                 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) ({                    \
+  v16u8 out_m;                                           \
+                                                         \
+  out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
+  out_m;                                                 \
+})
+
+/* Description : Converts inputs to unsigned bytes, interleaves, averages &
+                 stores them as an 8x4 unsigned byte block
+   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                          pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
+                                dst0, dst1, dst2, dst3, pdst, stride) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
+                                                                         \
+  tmp0_m = PCKEV_XORI128_UB(in0, in1);                                   \
+  tmp1_m = PCKEV_XORI128_UB(in2, in3);                                   \
+  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
+}
+
+/* Description : Pack even byte elements and store byte vector in destination
+                 memory
+   Arguments   : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) {             \
+  v16i8 tmp_m;                                    \
+                                                  \
+  tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+  ST_SB(tmp_m, (pdst));                           \
+}
+
+/* Description : Horizontal 2 tap filter kernel code
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \
+  v16i8 tmp0_m;                                                \
+  v8u16 tmp1_m;                                                \
+                                                               \
+  tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
+  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
+  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
+                                                               \
+  tmp1_m;                                                      \
+})
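+
+/* Editor's note: scalar equivalent, not part of the upstream file.
+   For each output lane the kernel gathers a byte pair selected by 'mask' from
+   the concatenation of in0/in1, forms an unsigned dot product with the two
+   taps held in 'coeff', and applies a rounding right shift:
+     out[i] = (p0[i] * c0[i] + p1[i] * c1[i] + (1 << (shift - 1))) >> shift */
+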
+#endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
diff --git a/libvpx/vpx_dsp/mips/sad_msa.c b/libvpx/vpx_dsp/mips/sad_msa.c
new file mode 100644
index 0000000..3bdec28
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/sad_msa.c
@@ -0,0 +1,1525 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {    \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);  \
+}
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
+
+static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 diff;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad += __msa_hadd_u_h(diff, diff);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
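+/* Editor's note: illustrative sketch, not part of the upstream file.
+   Width-specialised helpers like the one above are presumably wrapped into
+   the per-size vpx_sad<W>x<H>_msa entry points exposed via vpx_dsp_rtcd.h,
+   roughly:
+
+     uint32_t vpx_sad4x4_msa(const uint8_t *src, int32_t src_stride,
+                             const uint8_t *ref, int32_t ref_stride) {
+       return sad_4width_msa(src, src_stride, ref, ref_stride, 4);
+     }
+*/
+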
+static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src, src_stride, src0, src1);
+    src += (2 * src_stride);
+    LD_UB2(ref, ref_stride, ref0, ref1);
+    ref += (2 * ref_stride);
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, src_stride, src0, src1);
+    src += (2 * src_stride);
+    LD_UB2(ref, ref_stride, ref0, ref1);
+    ref += (2 * ref_stride);
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB2(ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB2(ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB2(ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB2(ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  uint32_t sad = 0;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = HADD_UH_U32(sad0);
+  sad += HADD_UH_U32(sad1);
+
+  return sad;
+}
+
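+/* Editor's note: not part of the upstream file.  The 64-pixel-wide variant
+   splits the running total across two v8u16 accumulators and reduces them
+   separately, presumably so the per-lane 16-bit sums cannot overflow over a
+   full 64x64 block before the final reduction. */
+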
+static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src, ref, ref0, ref1, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = height >> 1; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v4u32 sad;
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
+    ref0_4 = LD_UB(ref + 64);
+    ref += ref_stride;
+
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32((v4i32)sad);
+}
+
+static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src, ref0, ref1, ref;
+  v16u8 diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1;
+  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  const uint8_t *src_dup, *ref_dup;
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v8u16 sad3_0 = { 0 };
+  v8u16 sad3_1 = { 0 };
+  v4u32 sad;
+
+  src_dup = src;
+  ref_dup = ref;
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+    ref += ref_stride;
+
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[3] = HADD_SW_S32(sad);
+
+  sad0_0 = (v8u16)__msa_ldi_h(0);
+  sad0_1 = (v8u16)__msa_ldi_h(0);
+  sad1_0 = (v8u16)__msa_ldi_h(0);
+  sad1_1 = (v8u16)__msa_ldi_h(0);
+  sad2_0 = (v8u16)__msa_ldi_h(0);
+  sad2_1 = (v8u16)__msa_ldi_h(0);
+  sad3_0 = (v8u16)__msa_ldi_h(0);
+  sad3_1 = (v8u16)__msa_ldi_h(0);
+
+  for (ht_cnt = 64; ht_cnt--;) {
+    LD_UB4(src_dup, 16, src0, src1, src2, src3);
+    src_dup += src_stride;
+    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+    ref_dup += ref_stride;
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[4] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[5] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[6] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[7] = HADD_SW_S32(sad);
+}
+
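+// The *_x4d helpers compute the SAD of one source block against four
+// independent reference blocks (aref_ptr[0..3]) in a single pass; the
+// encoder's motion search uses these to score four candidate predictors
+// per call.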
+static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t * const aref_ptr[],
+                               int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    src_ptr += (4 * src_stride);
+
+    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref0_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref1_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref2_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref3_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t * const aref_ptr[],
+                               int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref0_ptr += (4 * ref_stride);
+    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
+    ref1_ptr += (4 * ref_stride);
+    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
+    ref2_ptr += (4 * ref_stride);
+    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
+    ref3_ptr += (4 * ref_stride);
+
+    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  v16u8 src, ref0, ref1, ref2, ref3, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref0 = LD_UB(ref0_ptr);
+    ref0_ptr += ref_stride;
+    ref1 = LD_UB(ref1_ptr);
+    ref1_ptr += ref_stride;
+    ref2 = LD_UB(ref2_ptr);
+    ref2_ptr += ref_stride;
+    ref3 = LD_UB(ref3_ptr);
+    ref3_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref1);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref2);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref3);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref0 = LD_UB(ref0_ptr);
+    ref0_ptr += ref_stride;
+    ref1 = LD_UB(ref1_ptr);
+    ref1_ptr += ref_stride;
+    ref2 = LD_UB(ref2_ptr);
+    ref2_ptr += ref_stride;
+    ref3 = LD_UB(ref3_ptr);
+    ref3_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref1);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref2);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref3);
+    sad3 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+
+    LD_UB2(ref0_ptr, 16, ref0, ref1);
+    ref0_ptr += ref_stride;
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref1_ptr, 16, ref0, ref1);
+    ref1_ptr += ref_stride;
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref2_ptr, 16, ref0, ref1);
+    ref2_ptr += ref_stride;
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref3_ptr, 16, ref0, ref1);
+    ref3_ptr += ref_stride;
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v8u16 sad3_0 = { 0 };
+  v8u16 sad3_1 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+
+    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
+    ref0_ptr += ref_stride;
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
+    ref1_ptr += ref_stride;
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
+    ref2_ptr += ref_stride;
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
+    ref3_ptr += ref_stride;
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0_0);
+  sad_array[0] += HADD_UH_U32(sad0_1);
+  sad_array[1] = HADD_UH_U32(sad1_0);
+  sad_array[1] += HADD_UH_U32(sad1_1);
+  sad_array[2] = HADD_UH_U32(sad2_0);
+  sad_array[2] += HADD_UH_U32(sad2_1);
+  sad_array[3] = HADD_UH_U32(sad3_0);
+  sad_array[3] += HADD_UH_U32(sad3_1);
+}
+
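+// The avgsad_* helpers implement the vpx_sadNxM_avg_msa variants: each
+// reference row is first averaged with a second predictor (`sec_pred`, a
+// contiguous width x height buffer) and the SAD is taken against that
+// average.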
+static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                  const uint8_t *ref_ptr, int32_t ref_stride,
+                                  int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 diff, pred, comp;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+    comp = __msa_aver_u_b(pred, ref);
+    diff = __msa_asub_u_b(src, comp);
+    sad += __msa_hadd_u_h(diff, diff);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
+                                  const uint8_t *ref, int32_t ref_stride,
+                                  int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 diff0, diff1, pred0, pred1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
+    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 3); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * 16);
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * 16);
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 comp0, comp1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
+    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
+    ref += (4 * ref_stride);
+
+    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
+    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
+    sec_pred += (4 * 32);
+
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
+    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
+    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
+    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 comp0, comp1, comp2, comp3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v4u32 sad;
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+  }
+
+  sad = __msa_hadd_u_w(sad0, sad0);
+  sad += __msa_hadd_u_w(sad1, sad1);
+
+  return HADD_SW_S32(sad);
+}
+
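+// The macros below stamp out the exported vpx_sad / vpx_sad_avg entry points
+// wired up through the run-time dispatch tables (vpx_dsp_rtcd); each wrapper
+// simply forwards to the width-specialized helper with the block height baked
+// in.  For example, VPX_SAD_16xHEIGHT_MSA(16) expands to:
+//
+//   uint32_t vpx_sad16x16_msa(const uint8_t *src, int32_t src_stride,
+//                             const uint8_t *ref, int32_t ref_stride) {
+//     return sad_16width_msa(src, src_stride, ref, ref_stride, 16);
+//   }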
+#define VPX_SAD_4xHEIGHT_MSA(height)                                        \
+uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,    \
+                                 const uint8_t *ref, int32_t ref_stride) {  \
+  return sad_4width_msa(src, src_stride, ref, ref_stride, height);          \
+}
+
+#define VPX_SAD_8xHEIGHT_MSA(height)                                        \
+uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,    \
+                                 const uint8_t *ref, int32_t ref_stride) {  \
+  return sad_8width_msa(src, src_stride, ref, ref_stride, height);          \
+}
+
+#define VPX_SAD_16xHEIGHT_MSA(height)                                        \
+uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,    \
+                                  const uint8_t *ref, int32_t ref_stride) {  \
+  return sad_16width_msa(src, src_stride, ref, ref_stride, height);          \
+}
+
+#define VPX_SAD_32xHEIGHT_MSA(height)                                        \
+uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,    \
+                                  const uint8_t *ref, int32_t ref_stride) {  \
+  return sad_32width_msa(src, src_stride, ref, ref_stride, height);          \
+}
+
+#define VPX_SAD_64xHEIGHT_MSA(height)                                        \
+uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,    \
+                                  const uint8_t *ref, int32_t ref_stride) {  \
+  return sad_64width_msa(src, src_stride, ref, ref_stride, height);          \
+}
+
+#define VPX_SAD_4xHEIGHTx3_MSA(height)                                  \
+void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_8xHEIGHTx3_MSA(height)                                  \
+void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_16xHEIGHTx3_MSA(height)                                  \
+void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_32xHEIGHTx3_MSA(height)                                  \
+void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_64xHEIGHTx3_MSA(height)                                  \
+void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_4xHEIGHTx8_MSA(height)                                  \
+void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_8xHEIGHTx8_MSA(height)                                  \
+void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_16xHEIGHTx8_MSA(height)                                  \
+void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_32xHEIGHTx8_MSA(height)                                  \
+void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_64xHEIGHTx8_MSA(height)                                  \
+void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                  \
+void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *const refs[],             \
+                                int32_t ref_stride, uint32_t *sads) {    \
+  sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
+}
+
+#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                  \
+void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *const refs[],             \
+                                int32_t ref_stride, uint32_t *sads) {    \
+  sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
+}
+
+#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                  \
+void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+                                 const uint8_t *const refs[],             \
+                                 int32_t ref_stride, uint32_t *sads) {    \
+  sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
+}
+
+#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                  \
+void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+                                 const uint8_t *const refs[],             \
+                                 int32_t ref_stride, uint32_t *sads) {    \
+  sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
+}
+
+#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                  \
+void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+                                 const uint8_t *const refs[],             \
+                                 int32_t ref_stride, uint32_t *sads) {    \
+  sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
+}
+
+#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                       \
+uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+                                     const uint8_t *ref, int32_t ref_stride,  \
+                                     const uint8_t *second_pred) {            \
+  return avgsad_4width_msa(src, src_stride, ref, ref_stride,                  \
+                           height, second_pred);                              \
+}
+
+#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                       \
+uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+                                     const uint8_t *ref, int32_t ref_stride,  \
+                                     const uint8_t *second_pred) {            \
+  return avgsad_8width_msa(src, src_stride, ref, ref_stride,                  \
+                           height, second_pred);                              \
+}
+
+#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                       \
+uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+                                      const uint8_t *ref, int32_t ref_stride,  \
+                                      const uint8_t *second_pred) {            \
+  return avgsad_16width_msa(src, src_stride, ref, ref_stride,                  \
+                            height, second_pred);                              \
+}
+
+#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                       \
+uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+                                      const uint8_t *ref, int32_t ref_stride,  \
+                                      const uint8_t *second_pred) {            \
+  return avgsad_32width_msa(src, src_stride, ref, ref_stride,                  \
+                            height, second_pred);                              \
+}
+
+#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                       \
+uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+                                      const uint8_t *ref, int32_t ref_stride,  \
+                                      const uint8_t *second_pred) {            \
+  return avgsad_64width_msa(src, src_stride, ref, ref_stride,                  \
+                            height, second_pred);                              \
+}
+
+// 64x64
+VPX_SAD_64xHEIGHT_MSA(64);
+VPX_SAD_64xHEIGHTx3_MSA(64);
+VPX_SAD_64xHEIGHTx8_MSA(64);
+VPX_SAD_64xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_64xHEIGHT_MSA(64);
+
+// 64x32
+VPX_SAD_64xHEIGHT_MSA(32);
+VPX_SAD_64xHEIGHTx3_MSA(32);
+VPX_SAD_64xHEIGHTx8_MSA(32);
+VPX_SAD_64xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_64xHEIGHT_MSA(32);
+
+// 32x64
+VPX_SAD_32xHEIGHT_MSA(64);
+VPX_SAD_32xHEIGHTx3_MSA(64);
+VPX_SAD_32xHEIGHTx8_MSA(64);
+VPX_SAD_32xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_32xHEIGHT_MSA(64);
+
+// 32x32
+VPX_SAD_32xHEIGHT_MSA(32);
+VPX_SAD_32xHEIGHTx3_MSA(32);
+VPX_SAD_32xHEIGHTx8_MSA(32);
+VPX_SAD_32xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_32xHEIGHT_MSA(32);
+
+// 32x16
+VPX_SAD_32xHEIGHT_MSA(16);
+VPX_SAD_32xHEIGHTx3_MSA(16);
+VPX_SAD_32xHEIGHTx8_MSA(16);
+VPX_SAD_32xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_32xHEIGHT_MSA(16);
+
+// 16x32
+VPX_SAD_16xHEIGHT_MSA(32);
+VPX_SAD_16xHEIGHTx3_MSA(32);
+VPX_SAD_16xHEIGHTx8_MSA(32);
+VPX_SAD_16xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_16xHEIGHT_MSA(32);
+
+// 16x16
+VPX_SAD_16xHEIGHT_MSA(16);
+VPX_SAD_16xHEIGHTx3_MSA(16);
+VPX_SAD_16xHEIGHTx8_MSA(16);
+VPX_SAD_16xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_16xHEIGHT_MSA(16);
+
+// 16x8
+VPX_SAD_16xHEIGHT_MSA(8);
+VPX_SAD_16xHEIGHTx3_MSA(8);
+VPX_SAD_16xHEIGHTx8_MSA(8);
+VPX_SAD_16xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_16xHEIGHT_MSA(8);
+
+// 8x16
+VPX_SAD_8xHEIGHT_MSA(16);
+VPX_SAD_8xHEIGHTx3_MSA(16);
+VPX_SAD_8xHEIGHTx8_MSA(16);
+VPX_SAD_8xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_8xHEIGHT_MSA(16);
+
+// 8x8
+VPX_SAD_8xHEIGHT_MSA(8);
+VPX_SAD_8xHEIGHTx3_MSA(8);
+VPX_SAD_8xHEIGHTx8_MSA(8);
+VPX_SAD_8xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_8xHEIGHT_MSA(8);
+
+// 8x4
+VPX_SAD_8xHEIGHT_MSA(4);
+VPX_SAD_8xHEIGHTx3_MSA(4);
+VPX_SAD_8xHEIGHTx8_MSA(4);
+VPX_SAD_8xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_8xHEIGHT_MSA(4);
+
+// 4x8
+VPX_SAD_4xHEIGHT_MSA(8);
+VPX_SAD_4xHEIGHTx3_MSA(8);
+VPX_SAD_4xHEIGHTx8_MSA(8);
+VPX_SAD_4xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_4xHEIGHT_MSA(8);
+
+// 4x4
+VPX_SAD_4xHEIGHT_MSA(4);
+VPX_SAD_4xHEIGHTx3_MSA(4);
+VPX_SAD_4xHEIGHTx8_MSA(4);
+VPX_SAD_4xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 0000000..a592a2d
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,1952 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_msa[8][2] = {
+  { 128,   0, },
+  { 112,  16, },
+  {  96,  32, },
+  {  80,  48, },
+  {  64,  64, },
+  {  48,  80, },
+  {  32,  96, },
+  {  16, 112, },
+};
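+
+// Each pair of taps above sums to 128 (1 << FILTER_BITS) and the row index is
+// the 3-bit sub-pixel offset in eighths of a pixel, so the 2-tap filter is
+// out = (p0 * f0 + p1 * f1 + 64) >> FILTER_BITS.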
+
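+// CALC_MSE_AVG_B accumulates, for one 16-byte src/ref vector pair, the sum of
+// squared differences into `var` (interleave, halfword subtract, then a self
+// dot-product) and the signed sum of differences into `sub`; callers turn the
+// two totals into a variance with the VARIANCE_* macros below.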
+#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+                                                                   \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+                                                                   \
+  sub += res_l0_m + res_l1_m;                                      \
+}
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  sse - (((int64_t)diff * diff) >> shift)
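+
+// These expand to sse - (sum * sum) / (W * H), i.e. the mean-removed SSE that
+// the vpx variance functions return; `shift` is log2(W * H) (8 for a 16x16
+// block).  The _LARGE variant widens the square to 64 bits for the larger
+// block sizes, where sum * sum can exceed 32 bits.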
+
+static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int32_t ref_stride,
+                                        const uint8_t *sec_pred,
+                                        int32_t height,
+                                        int32_t *diff) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 pred, src = { 0 };
+  v16u8 ref = { 0 };
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int32_t ref_stride,
+                                        const uint8_t *sec_pred,
+                                        int32_t height,
+                                        int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
+                                         int32_t src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int32_t ref_stride,
+                                         const uint8_t *sec_pred,
+                                         int32_t height,
+                                         int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src, ref, pred;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
+                                         int32_t src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int32_t ref_stride,
+                                         const uint8_t *sec_pred,
+                                         int32_t height,
+                                         int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1, pred0, pred1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1, pred0, pred1;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v8i16 avg2 = { 0 };
+  v8i16 avg3 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 32; ht_cnt--;) {
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  vec += __msa_hadd_s_w(avg2, avg2);
+  vec += __msa_hadd_s_w(avg3, avg3);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
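+// The sub_pixel_sse_diff_* helpers below apply the 2-tap bilinear filter to
+// the source horizontally (_h), vertically (_v) or in both directions (_hv),
+// then accumulate SSE and the signed difference sum against `dst`; the
+// sub-pixel variance wrappers later in this file combine the two with the
+// VARIANCE_* macros.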
+static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 filt0, ref = { 0 };
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+    CALC_MSE_AVG_B(src0, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 filt0, out, ref0, ref1, ref2, ref3;
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 dst0, dst1, dst2, dst3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, dst0, var, avg);
+    CALC_MSE_AVG_B(src1, dst1, var, avg);
+    CALC_MSE_AVG_B(src2, dst2, var, avg);
+    CALC_MSE_AVG_B(src3, dst3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
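+// The 32- and 64-pixel-wide variants are built from the 16-wide kernel: it is
+// run over successive 16-pixel columns and the partial SSE and difference
+// sums are added up.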
+static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4, out;
+  v16u8 src10_r, src32_r, src21_r, src43_r;
+  v16u8 ref = { 0 };
+  v16u8 src2110, src4332;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  v8u16 tmp0, tmp1;
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+               vec0, vec1, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1, out2, out3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    src0 = src4;
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
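+/* Blocks wider than 16 pixels are handled as side-by-side 16-pixel columns:
+ * each column reuses the 16-wide kernel and the per-column SSE and signed
+ * pixel-difference sums are accumulated before returning. */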
+static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
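+/* The _hv_ kernels filter horizontally with the 2-tap bilinear filter first,
+ * then vertically with the second filter, rounding by FILTER_BITS after each
+ * stage, and accumulate SSE and sum against the reference block. */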
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter_horiz,
+                                                 const uint8_t *filter_vert,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out, ref = { 0 };
+  v16u8 filt_vt, filt_hz, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+  v8u16 tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter_horiz,
+                                                 const uint8_t *filter_vert,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt_vt, filt_hz, vec0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
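+/* For 16-wide hv filtering each row is split into two 8-pixel halves; the
+ * horizontally filtered outputs of the previous row are carried across loop
+ * iterations so the vertical 2-tap filter always has both rows available. */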
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+  v8u16 tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  LD_UB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    CALC_MSE_AVG_B(src2, ref2, var, avg);
+    CALC_MSE_AVG_B(src3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+                                             filter_horiz, filter_vert, height,
+                                             &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+                                             filter_horiz, filter_vert, height,
+                                             &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
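+/* The _avg_ variants below additionally average the filtered prediction with
+ * the caller-supplied second predictor (sec_pred) before the SSE and sum are
+ * accumulated against the reference block. */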
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 out, pred, filt0, ref = { 0 };
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 out, pred, filt0;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v16u8 pred0, pred1, pred2, pred3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst += (4 * dst_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
+                tmp0, tmp1, tmp2, tmp3);
+    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
+                tmp0, tmp1, tmp2, tmp3);
+
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 src10_r, src32_r, src21_r, src43_r;
+  v16u8 out, pred, ref = { 0 };
+  v16u8 src2110, src4332, filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  v8u16 tmp0, tmp1;
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, filt0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+               vec0, vec1, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1, out2, out3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    src0 = src4;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 out, pred, ref = { 0 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 pred0, pred1, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              const uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const uint8_t *sec_pred,
+                                              const uint8_t *filter_horiz,
+                                              const uint8_t *filter_vert,
+                                              int32_t height,
+                                              int32_t *diff,
+                                              int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 out0, out1, out2, out3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  LD_UB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                       sec_pred, filter_horiz, filter_vert,
+                                       height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
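+/* The last macro argument is log2(width * height): the variance helpers
+ * subtract (sum * sum) >> log2(w * h) from the SSE. The _LARGE_ form covers
+ * block sizes whose squared pixel-difference sum no longer fits in 32 bits. */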
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
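+/* Each generated vpx_sub_pixel_variance<wd>x<ht>_msa dispatches on the filter
+ * offsets: both non-zero -> _hv_ kernel, only yoffset -> _v_, only xoffset ->
+ * _h_, and both zero falls back to the full-pel vpx_variance<wd>x<ht>_msa. */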
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
+uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
+                                                 int32_t src_stride,     \
+                                                 int32_t xoffset,        \
+                                                 int32_t yoffset,        \
+                                                 const uint8_t *ref,     \
+                                                 int32_t ref_stride,     \
+                                                 uint32_t *sse) {        \
+  int32_t diff;                                                          \
+  uint32_t var;                                                          \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
+                                                                         \
+  if (yoffset) {                                                         \
+    if (xoffset) {                                                       \
+      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
+                                                   ref, ref_stride,      \
+                                                   h_filter, v_filter,   \
+                                                   ht, &diff);           \
+    } else {                                                             \
+      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
+                                                  ref, ref_stride,       \
+                                                  v_filter, ht, &diff);  \
+    }                                                                    \
+                                                                         \
+    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
+  } else {                                                               \
+    if (xoffset) {                                                       \
+      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
+                                                  ref, ref_stride,       \
+                                                  h_filter, ht, &diff);  \
+                                                                         \
+      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
+    } else {                                                             \
+      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
+                                          ref, ref_stride, sse);         \
+    }                                                                    \
+  }                                                                      \
+                                                                         \
+  return var;                                                            \
+}
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+
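+/* Same dispatch as above, but every path blends with sec_pred; with both
+ * offsets zero the full-pel avg_sse_diff_<wd>width_msa helper is used. */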
+#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
+uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
+  const uint8_t *src_ptr, int32_t src_stride,                                 \
+  int32_t xoffset, int32_t yoffset,                                           \
+  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
+  uint32_t *sse, const uint8_t *sec_pred) {                                   \
+  int32_t diff;                                                               \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                    \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                    \
+                                                                              \
+  if (yoffset) {                                                              \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
+                                                       ref_ptr, ref_stride,   \
+                                                       sec_pred, h_filter,    \
+                                                       v_filter, ht, &diff);  \
+    } else {                                                                  \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, v_filter,     \
+                                                      ht, &diff);             \
+    }                                                                         \
+  } else {                                                                    \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, h_filter,     \
+                                                      ht, &diff);             \
+    } else {                                                                  \
+      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
+                                          ref_ptr, ref_stride,                \
+                                          sec_pred, ht, &diff);               \
+    }                                                                         \
+  }                                                                           \
+                                                                              \
+  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
+}
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+
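+/* 32x64 is written out rather than generated by the macro above; its
+ * full-pel path calls the dedicated avg_sse_diff_32x64_msa helper instead of
+ * the generic 32-wide one. */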
+uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+                                             int32_t src_stride,
+                                             int32_t xoffset,
+                                             int32_t yoffset,
+                                             const uint8_t *ref_ptr,
+                                             int32_t ref_stride,
+                                             uint32_t *sse,
+                                             const uint8_t *sec_pred) {
+  int32_t diff;
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+
+  if (yoffset) {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
+                                                   ref_ptr, ref_stride,
+                                                   sec_pred, h_filter,
+                                                   v_filter, 64, &diff);
+    } else {
+      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, v_filter,
+                                                  64, &diff);
+    }
+  } else {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, h_filter,
+                                                  64, &diff);
+    } else {
+      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+                                    sec_pred, &diff);
+    }
+  }
+
+  return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
+uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
+                                                 int32_t src_stride,         \
+                                                 int32_t xoffset,            \
+                                                 int32_t yoffset,            \
+                                                 const uint8_t *ref_ptr,     \
+                                                 int32_t ref_stride,         \
+                                                 uint32_t *sse,              \
+                                                 const uint8_t *sec_pred) {  \
+  int32_t diff;                                                              \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                   \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                   \
+                                                                             \
+  if (yoffset) {                                                             \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
+                                                   ref_ptr, ref_stride,      \
+                                                   sec_pred, h_filter,       \
+                                                   v_filter, ht, &diff);     \
+    } else {                                                                 \
+      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, v_filter,        \
+                                                  ht, &diff);                \
+    }                                                                        \
+  } else {                                                                   \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, h_filter,        \
+                                                  ht, &diff);                \
+    } else {                                                                 \
+      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
+                                        ref_ptr, ref_stride,                 \
+                                        sec_pred, &diff);                    \
+    }                                                                        \
+  }                                                                          \
+                                                                             \
+  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
+}
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/libvpx/vpx_dsp/mips/subtract_msa.c b/libvpx/vpx_dsp/mips/subtract_msa.c
new file mode 100644
index 0000000..9ac43c5
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/subtract_msa.c
@@ -0,0 +1,264 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t src0, src1, src2, src3;
+  uint32_t pred0, pred1, pred2, pred3;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  LW4(src_ptr, src_stride, src0, src1, src2, src3);
+  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+  INSERT_W4_SB(src0, src1, src2, src3, src);
+  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+  ILVRL_B2_UB(src, pred, src_l0, src_l1);
+  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  uint64_t src0, src1, pred0, pred1;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 4; loop_cnt--;) {
+    LD2(src_ptr, src_stride, src0, src1);
+    src_ptr += (2 * src_stride);
+    LD2(pred_ptr, pred_stride, pred0, pred1);
+    pred_ptr += (2 * pred_stride);
+
+    INSERT_D2_SB(src0, src1, src);
+    INSERT_D2_SB(pred0, pred1, pred);
+    ILVRL_B2_UB(src, pred, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+    diff_ptr += (2 * diff_stride);
+  }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  int8_t count;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (count = 2; count--;) {
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    LD_SB8(pred, pred_stride,
+           pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+    pred += (8 * pred_stride);
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    LD_SB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_SB2(src, 16, src2, src3);
+    src += src_stride;
+    LD_SB2(src, 16, src4, src5);
+    src += src_stride;
+    LD_SB2(src, 16, src6, src7);
+    src += src_stride;
+
+    LD_SB2(pred, 16, pred0, pred1);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred2, pred3);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred4, pred5);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred6, pred7);
+    pred += pred_stride;
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    LD_SB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_SB4(src, 16, src4, src5, src6, src7);
+    src += src_stride;
+
+    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+    pred += pred_stride;
+    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+    pred += pred_stride;
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
+                            int16_t *diff_ptr, ptrdiff_t diff_stride,
+                            const uint8_t *src_ptr, ptrdiff_t src_stride,
+                            const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libvpx/vpx_dsp/mips/txfm_macros_msa.h
new file mode 100644
index 0000000..68c63d5
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/txfm_macros_msa.h
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {      \
+  v8i16 k0_m = __msa_fill_h(cnst0);                                  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
+                                                                     \
+  s0_m = (v4i32)__msa_fill_h(cnst1);                                 \
+  k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                           \
+                                                                     \
+  ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                            \
+  ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                               \
+  DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+                                                                     \
+  DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+}
+
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,      \
+                              dst0, dst1, dst2, dst3) {                    \
+  v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                                 \
+  v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                                 \
+                                                                           \
+  DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                      \
+              tp0_m, tp2_m, tp3_m, tp4_m);                                 \
+  DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                      \
+              tp5_m, tp6_m, tp7_m, tp8_m);                                 \
+  BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);     \
+  BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);     \
+  SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS);                 \
+  SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS);                 \
+  PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,      \
+              dst0, dst1, dst2, dst3);                                     \
+}
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({       \
+  v8i16 dst_m;                                        \
+  v4i32 tp0_m, tp1_m;                                 \
+                                                      \
+  DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);      \
+  SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS);          \
+  dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m);  \
+                                                      \
+  dst_m;                                              \
+})
+
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) {                    \
+  v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                         \
+  v8i16 madd_s0_m, madd_s1_m;                                       \
+                                                                    \
+  ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,           \
+              c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);  \
+  SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);  \
+  PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);      \
+}
+
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,         \
+                out0, out1, out2, out3) {                               \
+  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+                                                                        \
+  ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+  ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+#endif  // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/libvpx/vpx_dsp/mips/variance_msa.c b/libvpx/vpx_dsp/mips/variance_msa.c
new file mode 100644
index 0000000..33e1755
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/variance_msa.c
@@ -0,0 +1,633 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define CALC_MSE_B(src, ref, var) {                                \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+                                                                   \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+}
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+                                                                   \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+                                                                   \
+  sub += res_l0_m + res_l1_m;                                      \
+}
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  sse - (((int64_t)diff * diff) >> shift)
+
+static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  int32_t ht_cnt;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src, ref;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v8i16 avg2 = { 0 };
+  v8i16 avg3 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 32; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  vec += __msa_hadd_s_w(avg2, avg2);
+  vec += __msa_hadd_s_w(avg3, avg3);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t get_mb_ss_msa(const int16_t *src) {
+  uint32_t sum, cnt;
+  v8i16 src0, src1, src2, src3;
+  v4i32 src0_l, src1_l, src2_l, src3_l;
+  v4i32 src0_r, src1_r, src2_r, src3_r;
+  v2i64 sq_src_l = { 0 };
+  v2i64 sq_src_r = { 0 };
+
+  for (cnt = 8; cnt--;) {
+    LD_SH4(src, 8, src0, src1, src2, src3);
+    src += 4 * 8;
+
+    UNPCK_SH_SW(src0, src0_l, src0_r);
+    UNPCK_SH_SW(src1, src1_l, src1_r);
+    UNPCK_SH_SW(src2, src2_l, src2_r);
+    UNPCK_SH_SW(src3, src3_l, src3_r);
+
+    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
+  }
+
+  sq_src_l += __msa_splati_d(sq_src_l, 1);
+  sq_src_r += __msa_splati_d(sq_src_r, 1);
+
+  sum = __msa_copy_s_d(sq_src_l, 0);
+  sum += __msa_copy_s_d(sq_src_r, 0);
+
+  return sum;
+}
+
+static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v4i32 var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    CALC_MSE_B(src, ref, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v4i32 var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src, ref;
+  v4i32 var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v4i32 var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v4i32 var = { 0 };
+
+  for (ht_cnt = height >> 1; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src2, ref2, var);
+    CALC_MSE_B(src1, ref1, var);
+    CALC_MSE_B(src3, ref3, var);
+
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src2, ref2, var);
+    CALC_MSE_B(src1, ref1, var);
+    CALC_MSE_B(src3, ref3, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride) {
+  uint32_t err = 0;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16i8 src = { 0 };
+  v16i8 ref = { 0 };
+  v16u8 src_vec0, src_vec1;
+  v8i16 diff0, diff1;
+  v4i32 err0 = { 0 };
+  v4i32 err1 = { 0 };
+
+  LW4(src_ptr, src_stride, src0, src1, src2, src3);
+  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+  INSERT_W4_SB(src0, src1, src2, src3, src);
+  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
+  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
+  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
+  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
+  err = HADD_SW_S32(err0);
+  err += HADD_SW_S32(err1);
+
+  return err;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define VPX_VARIANCE_WDXHT_MSA(wd, ht)                               \
+uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src,           \
+                                       int32_t src_stride,           \
+                                       const uint8_t *ref,           \
+                                       int32_t ref_stride,           \
+                                       uint32_t *sse) {              \
+  int32_t diff;                                                      \
+                                                                     \
+  *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride,  \
+                                  ht, &diff);                        \
+                                                                     \
+  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                       \
+}
+
+VPX_VARIANCE_WDXHT_MSA(4, 4);
+VPX_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_VARIANCE_WDXHT_MSA(8, 4)
+VPX_VARIANCE_WDXHT_MSA(8, 8)
+VPX_VARIANCE_WDXHT_MSA(8, 16)
+
+VPX_VARIANCE_WDXHT_MSA(16, 8)
+VPX_VARIANCE_WDXHT_MSA(16, 16)
+VPX_VARIANCE_WDXHT_MSA(16, 32)
+
+VPX_VARIANCE_WDXHT_MSA(32, 16)
+VPX_VARIANCE_WDXHT_MSA(32, 32)
+
+uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_32Wx64H(*sse, diff);
+}
+
+uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_64Wx32H(*sse, diff);
+}
+
+uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
+                        const uint8_t *ref, int32_t ref_stride,
+                        uint32_t *sse) {
+  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
+
+  return *sse;
+}
+
+uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse) {
+  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
+
+  return *sse;
+}
+
+uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse) {
+  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
+
+  return *sse;
+}
+
+uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
+                          const uint8_t *ref, int32_t ref_stride,
+                          uint32_t *sse) {
+  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
+
+  return *sse;
+}
+
+void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
+                       const uint8_t *ref, int32_t ref_stride,
+                       uint32_t *sse, int32_t *sum) {
+  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
+}
+
+void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse, int32_t *sum) {
+  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
+}
+
+uint32_t vpx_get_mb_ss_msa(const int16_t *src) {
+  return get_mb_ss_msa(src);
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
new file mode 100644
index 0000000..f6244d8
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -0,0 +1,743 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst0, dst1, dst2, dst3, res2, res3;
+  v16u8 mask0, mask1, mask2, mask3;
+  v8i16 filt, res0, res1;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, res0, res1);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  SRARI_H2_SH(res0, res1, FILTER_BITS);
+  SAT_SH2_SH(res0, res1, 7);
+  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  XORI_B2_128_UB(res2, res3);
+  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8i16 filt, vec0, vec1, vec2, vec3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, vec0, vec1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, vec2, vec3);
+  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
+              res3);
+  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+  XORI_B2_128_UB(res0, res2);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+             dst6);
+  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+  ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = height >> 1; loop_cnt--;) {
+    LD_SB2(src, src_stride, src0, src2);
+    LD_SB2(src + 8, src_stride, src1, src3);
+    src += (2 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+               vec14);
+    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+               vec15);
+    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                vec9, vec10, vec11);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+                 vec2, vec3);
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                 vec9, vec10, vec11);
+    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                out2, out3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+               vec14);
+    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+               vec15);
+    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                vec9, vec10, vec11);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+                 vec2, vec3);
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                 vec9, vec10, vec11);
+    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    LD_UB2(dst, 16, dst1, dst2);
+    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = height; loop_cnt--;) {
+    for (cnt = 0; cnt < 2; ++cnt) {
+      src0 = LD_SB(&src[cnt << 5]);
+      src2 = LD_SB(&src[16 + (cnt << 5)]);
+      src3 = LD_SB(&src[24 + (cnt << 5)]);
+      src1 = __msa_sldi_b(src2, src0, 8);
+
+      XORI_B4_128_SB(src0, src1, src2, src3);
+      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                 vec12);
+      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                 vec13);
+      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                 vec14);
+      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                 vec15);
+      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                  vec1, vec2, vec3);
+      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                  vec9, vec10, vec11);
+      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                   vec1, vec2, vec3);
+      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                   vec9, vec10, vec11);
+      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                  out2, out3);
+      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+      SAT_SH4_SH(out0, out1, out2, out3, 7);
+      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+  v8u16 vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8u16 vec4, vec5, vec6, vec7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+              vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+              res3);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+             dst6);
+  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+              res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter,
+                                                  int32_t height) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+  dst += (4 * dst_stride);
+
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+  dst += (4 * dst_stride);
+
+  if (16 == height) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB4(src, src_stride, src0, src2, src4, src6);
+  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+  src += (4 * src_stride);
+
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+              res2, res3);
+  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+              res6, res7);
+  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+  dst += dst_stride;
+
+  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    src4 = LD_SB(src);
+    src6 = LD_SB(src + 16);
+    src7 = LD_SB(src + 24);
+    src5 = __msa_sldi_b(src6, src4, 8);
+    src += src_stride;
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+    LD_UB2(dst, 16, dst0, dst1);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+    dst += dst_stride;
+    LD_UB2(dst, 16, dst2, dst3);
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  for (loop_cnt = height; loop_cnt--;) {
+    LD_SB4(src, 16, src0, src2, src4, src6);
+    src7 = LD_SB(src + 56);
+    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+    src += src_stride;
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h) {
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+
+  if (((const int32_t *)filter_x)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                  filter_x, x_step_q4, filter_y, y_step_q4,
+                                  w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                  filter_x, x_step_q4, filter_y, y_step_q4,
+                                  w, h);
+        break;
+    }
+  }
+}
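
For reference, the 2-tap branch above (taken when the first two taps of filter_x are zero, i.e. a bilinear filter) computes, per output pixel, the pair product of the two non-zero taps, rounds by FILTER_BITS (7 in libvpx), and averages the result with the existing destination pixel. A minimal scalar sketch of that behaviour follows; the helper name avg_horiz_2t_ref and its exact prototype are illustrative assumptions, not libvpx API:

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Scalar sketch of the horizontal 2-tap convolve-and-average path.
 * filter_x[3] and filter_x[4] are the only non-zero taps in this branch. */
static void avg_horiz_2t_ref(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *filter_x, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int sum = src[x] * filter_x[3] + src[x + 1] * filter_x[4];
      const int px = ROUND_POWER_OF_TWO(sum, FILTER_BITS);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + px, 1);  /* average with dst */
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The MSA kernels above produce the same values in vector form: the DOTP_UB macros perform the per-pixel pair products, SRARI_H does the FILTER_BITS rounding, and the PCKEV_AVG_ST macros pack the results, average them with the loaded dst rows, and store.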
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
new file mode 100644
index 0000000..2abde6d
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -0,0 +1,661 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= (3 + 3 * src_stride);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+    SRARI_H2_SH(res0, res1, FILTER_BITS);
+    SAT_SH2_SH(res0, res1, 7);
+    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+    XORI_B2_128_UB(tmp0, tmp1);
+    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    hz_out5 = hz_out9;
+    vec0 = vec2;
+    vec1 = vec3;
+    vec2 = vec4;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= (3 + 3 * src_stride);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    hz_out6 = hz_out10;
+    out0 = out2;
+    out1 = out3;
+    out2 = out8;
+    out4 = out6;
+    out5 = out7;
+    out6 = out9;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 dst0, dst1, dst2, dst3, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+             dst4, dst6);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
+              res2, res3);
+  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (8 == height) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride);
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                     dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int8_t *filter_horiz,
+                                                       int8_t *filter_vert,
+                                                       int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_SB(src);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else {
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert,
+                                               height);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+    dst += dst_stride;
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (((const int32_t *)filter_x)[0] == 0 &&
+      ((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                              dst, (int32_t)dst_stride,
+                                              &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                              dst, (int32_t)dst_stride,
+                                              &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                            filter_x, x_step_q4, filter_y, y_step_q4,
+                            w, h);
+        break;
+    }
+  } else if (((const int32_t *)filter_x)[0] == 0 ||
+             ((const int32_t *)filter_y)[0] == 0) {
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4,
+                        w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                              dst, (int32_t)dst_stride,
+                                              filt_hor, filt_ver, h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                              dst, (int32_t)dst_stride,
+                                              filt_hor, filt_ver, h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               filt_hor, filt_ver, h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               filt_hor, filt_ver, h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                               dst, (int32_t)dst_stride,
+                                               filt_hor, filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                            filter_x, x_step_q4, filter_y, y_step_q4,
+                            w, h);
+        break;
+    }
+  }
+}
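
The 2-tap horizontal-plus-vertical branch of vpx_convolve8_avg_msa above uses the same separable scheme: a horizontal bilinear pass rounded to 8 bits, a vertical bilinear pass over that intermediate, then averaging with dst. A rough scalar sketch under those assumptions (hv_2t_avg_ref and its fixed-size intermediate buffer are illustrative only):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Scalar sketch of the 2-tap horizontal + 2-tap vertical convolve-and-average
 * path. Only taps [3] and [4] of each filter are non-zero in this branch. */
static void hv_2t_avg_ref(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride,
                          const int16_t *filter_x, const int16_t *filter_y,
                          int w, int h) {
  uint8_t tmp[(64 + 1) * 64];  /* enough for the largest 64x64 block */
  int x, y;

  /* Horizontal pass: one extra row so the vertical pass can read row y + 1. */
  for (y = 0; y < h + 1; ++y) {
    for (x = 0; x < w; ++x) {
      const int sum = src[x] * filter_x[3] + src[x + 1] * filter_x[4];
      tmp[y * w + x] = ROUND_POWER_OF_TWO(sum, FILTER_BITS);
    }
    src += src_stride;
  }

  /* Vertical pass plus averaging with the existing destination pixels. */
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int sum =
          tmp[y * w + x] * filter_y[3] + tmp[(y + 1) * w + x] * filter_y[4];
      const int px = ROUND_POWER_OF_TWO(sum, FILTER_BITS);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + px, 1);
    }
    dst += dst_stride;
  }
}

The 8-tap branch follows the same structure with eight taps per pass, signed saturation of the intermediate (SAT_SH to 7 bits), and the same final averaging; mixed 2-tap/8-tap filter pairs fall back to vpx_convolve8_avg_c, as the dispatch above shows.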
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
new file mode 100644
index 0000000..0164e41
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -0,0 +1,718 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3, out;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
+    out = __msa_aver_u_b(out, dst0);
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1, out2, out3;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                               filt1, filt2, filt3);
+    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                               filt1, filt2, filt3);
+    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                               filt1, filt2, filt3);
+    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                               filt1, filt2, filt3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter,
+                                                   int32_t height,
+                                                   int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+
+      LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
+                  dst2, dst3);
+      ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+      dst_tmp += (4 * dst_stride);
+
+      src10_r = src54_r;
+      src32_r = src76_r;
+      src54_r = src98_r;
+      src21_r = src65_r;
+      src43_r = src87_r;
+      src65_r = src109_r;
+      src10_l = src54_l;
+      src32_l = src76_l;
+      src54_l = src98_l;
+      src21_l = src65_l;
+      src43_l = src87_l;
+      src65_l = src109_l;
+      src6 = src10;
+    }
+
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4;
+  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  v16i8 src10_r, src32_r, src21_r, src43_r;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  src4 = LD_SB(src);
+  src += src_stride;
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  out = __msa_aver_u_b(out, dst0);
+
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  v16u8 src2110, src4332, src6554, src8776, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+             dst2, dst3);
+  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                     dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+    src += (8 * src_stride);
+    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+  /* rearranging filter_y */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+    dst += dst_stride;
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+  /* rearranging filter_y */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_UB2(src, 16, src0, src5);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+    LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+    src += (4 * src_stride);
+
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+    dst += (4 * dst_stride);
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5;
+  v16u8 src6, src7, src8, src9, src10, src11, filt0;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8u16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_UB4(src, 16, src0, src3, src6, src9);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(dst + 16, dst_stride, dst2, dst3);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(dst + 32, dst_stride, dst4, dst5);
+    LD_UB2(src + 48, src_stride, src10, src11);
+    LD_UB2(dst + 48, dst_stride, dst6, dst7);
+    src += (2 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, dst + 48);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+    dst += (2 * dst_stride);
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4,
+                                int w, int h) {
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
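+  /* The first 32 bits of the filter cover taps 0 and 1; when both are zero
+   * the kernel is effectively bilinear (only taps 3 and 4 contribute), so
+   * the cheaper 2-tap paths are taken, reading taps 3/4 via &filt_ver[3]. */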
+  if (((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
new file mode 100644
index 0000000..dbd120b
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -0,0 +1,703 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  SRARI_H2_SH(out0, out1, FILTER_BITS);
+  SAT_SH2_SH(out0, out1, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src0, src1, src2, src3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out2, out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  out = PCKEV_XORI128_UB(out2, out3);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1, out2,
+                             out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  tmp0 = PCKEV_XORI128_UB(out0, out1);
+  tmp1 = PCKEV_XORI128_UB(out2, out3);
+  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_SB2(src, src_stride, src0, src2);
+    LD_SB2(src + 8, src_stride, src1, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (2 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    dst += dst_stride;
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+    dst += dst_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+
+    src0 = LD_SB(src + 32);
+    src2 = LD_SB(src + 48);
+    src3 = LD_SB(src + 56);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst + 32);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, vec0, vec1, res0, res1;
+  v8u16 vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 vec0, vec1, vec2, vec3, filt0;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16i8 res0, res1, res2, res3;
+  v8u16 vec4, vec5, vec6, vec7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+              vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 filt0;
+  v16i8 src0, src1, src2, src3, mask;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+  ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  v16u8 filt0;
+  v16i8 src0, src1, src2, src3, mask, out0, out1;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+  dst += (4 * dst_stride);
+
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+  dst += (4 * dst_stride);
+
+  if (16 == height) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+  }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  loop_cnt = (height >> 2) - 1;
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  LD_SB4(src, src_stride, src0, src2, src4, src6);
+  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+  src += (4 * src_stride);
+
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+              out2, out3);
+  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+              out6, out7);
+  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+  PCKEV_ST_SB(out0, out1, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out2, out3, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out4, out5, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out6, out7, dst);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  for (loop_cnt = height >> 1; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    src4 = LD_SB(src);
+    src6 = LD_SB(src + 16);
+    src7 = LD_SB(src + 24);
+    src5 = __msa_sldi_b(src6, src4, 8);
+    src += src_stride;
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    PCKEV_ST_SB(out6, out7, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src4 = LD_SB(src + 32);
+    src6 = LD_SB(src + 48);
+    src7 = LD_SB(src + 56);
+    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+    src += src_stride;
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
+    PCKEV_ST_SB(out4, out5, dst + 32);
+    PCKEV_ST_SB(out6, out7, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+
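+  /* Taps 0 and 1 share the first 32 bits of filter_x; when both are zero the
+   * kernel is effectively bilinear, so the 2-tap paths are used with taps 3
+   * and 4 read through &filt_hor[3]. */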
+  if (((const int32_t *)filter_x)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_hz_2t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
new file mode 100644
index 0000000..7546f13
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -0,0 +1,635 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= (3 + 3 * src_stride);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    hz_out5 = hz_out9;
+    out0 = out2;
+    out1 = out3;
+    out2 = out4;
+  }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= (3 + 3 * src_stride);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src7, src8, src9, src10);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    ST8x4_UB(vec0, vec1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    hz_out6 = hz_out10;
+    out0 = out2;
+    out1 = out3;
+    out2 = out8;
+    out4 = out6;
+    out5 = out7;
+    out6 = out9;
+  }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  v16i8 res0, res1, res2, res3;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+              vec4, vec5, vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else if (8 == height) {
+    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int8_t *filter_horiz,
+                                          int8_t *filter_vert,
+                                          int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_SB(src);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (4 == height) {
+    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else {
+    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+  }
+}
+
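+/* 16-wide: each 16-pixel row is handled as a left and a right 8-pixel half
+   (loaded from src and src + 8), with four output rows produced per loop
+   iteration and one 16-byte store per row. */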
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+  v8i16 filt;
+
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 2; multiple8_cnt--;) {
+    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
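+/* Top-level 8-tap 2D convolution. A filter whose first two taps are zero is
+   treated as a 2-tap (bilinear) filter centred on taps 3 and 4, so the
+   cheaper 2-tap paths are used. Mixing a 2-tap filter in one direction with
+   an 8-tap filter in the other falls back to the C code, as do widths other
+   than 4/8/16/32/64. */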
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
+                       const int16_t *filter_x, int32_t x_step_q4,
+                       const int16_t *filter_y, int32_t y_step_q4,
+                       int32_t w, int32_t h) {
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (((const int32_t *)filter_x)[0] == 0 &&
+      ((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 &filt_hor[3], &filt_ver[3], (int32_t)h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 &filt_hor[3], &filt_ver[3], (int32_t)h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  &filt_hor[3], &filt_ver[3], (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4,
+                        w, h);
+        break;
+    }
+  } else if (((const int32_t *)filter_x)[0] == 0 ||
+             ((const int32_t *)filter_y)[0] == 0) {
+    vpx_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4,
+                    w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filt_hor, filt_ver, (int32_t)h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
+                                 dst, (int32_t)dst_stride,
+                                 filt_hor, filt_ver, (int32_t)h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filt_hor, filt_ver, (int32_t)h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filt_hor, filt_ver, (int32_t)h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filt_hor, filt_ver, (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4,
+                        w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
new file mode 100644
index 0000000..527d457
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -0,0 +1,710 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+
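+  /* Seven rows of context were interleaved above; each iteration loads four
+     new rows, evaluates the 8-tap vertical filter for four output rows and
+     then shifts the interleaved history forward. */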
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+             src54_l, src21_l);
+  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+               src87_l, src98_l, src109_l);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                 filt1, filt2, filt3);
+    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                 filt1, filt2, filt3);
+    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                 filt1, filt2, filt3);
+    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src54_l = src98_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src65_l = src109_l;
+    src6 = src10;
+  }
+}
+
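+/* Same as common_vt_8t_16w_msa but walks the block in 16-column strips so
+   it can also serve the 32- and 64-wide cases. */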
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+               src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+               src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+      dst_tmp += (4 * dst_stride);
+
+      src10_r = src54_r;
+      src32_r = src76_r;
+      src54_r = src98_r;
+      src21_r = src65_r;
+      src43_r = src87_r;
+      src65_r = src109_r;
+      src10_l = src54_l;
+      src32_l = src76_l;
+      src54_l = src98_l;
+      src21_l = src65_l;
+      src43_l = src87_l;
+      src65_l = src109_l;
+      src6 = src10;
+    }
+
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+
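+/* 2-tap (bilinear) vertical filters follow: every output row is the rounded,
+   filter-weighted combination of two vertically adjacent input rows. */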
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4;
+  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+  v16u8 filt0;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 filt;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+
+  src8 = LD_SB(src);
+  src += src_stride;
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+    src += (8 * src_stride);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src5 = LD_UB(src + 16);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+    src += (4 * src_stride);
+
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+    dst += (4 * dst_stride);
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  LD_UB4(src, 16, src0, src3, src6, src9);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(src + 48, src_stride, src10, src11);
+    src += (2 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+    dst += (2 * dst_stride);
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
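+/* Vertical-only dispatch: a filter whose first two taps are zero is handled
+   by the 2-tap paths (taps 3 and 4), otherwise the 8-tap paths are used;
+   unsupported widths fall back to vpx_convolve8_vert_c. */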
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
new file mode 100644
index 0000000..4c3d978
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/macros_msa.h"
+
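+/* Averaging "convolve": the source block is averaged into the destination,
+   dst = (src + dst + 1) >> 1, with width-specialised MSA loops. */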
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint32_t out0, out1, out2, out3;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  if (0 == (height % 4)) {
+    for (cnt = (height / 4); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                  dst0, dst1, dst2, dst3);
+
+      out0 = __msa_copy_u_w((v4i32)dst0, 0);
+      out1 = __msa_copy_u_w((v4i32)dst1, 0);
+      out2 = __msa_copy_u_w((v4i32)dst2, 0);
+      out3 = __msa_copy_u_w((v4i32)dst3, 0);
+      SW4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == (height % 2)) {
+    for (cnt = (height / 2); cnt--;) {
+      LD_UB2(src, src_stride, src0, src1);
+      src += (2 * src_stride);
+
+      LD_UB2(dst, dst_stride, dst0, dst1);
+
+      AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+      out0 = __msa_copy_u_w((v4i32)dst0, 0);
+      out1 = __msa_copy_u_w((v4i32)dst1, 0);
+      SW(out0, dst);
+      dst += dst_stride;
+      SW(out1, dst);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint64_t out0, out1, out2, out3;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  for (cnt = (height / 4); cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+
+    out0 = __msa_copy_u_d((v2i64)dst0, 0);
+    out1 = __msa_copy_u_d((v2i64)dst1, 0);
+    out2 = __msa_copy_u_d((v2i64)dst2, 0);
+    out3 = __msa_copy_u_d((v2i64)dst3, 0);
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+  for (cnt = (height / 8); cnt--;) {
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+    dst += (8 * dst_stride);
+  }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *dst_dup = dst;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (cnt = (height / 8); cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+    LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+    dst_dup += (4 * dst_stride);
+    LD_UB4(src, src_stride, src8, src10, src12, src14);
+    LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+    dst_dup += (4 * dst_stride);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                dst8, dst9, dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                dst12, dst13, dst14, dst15);
+
+    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+    ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+    ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *dst_dup = dst;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (cnt = (height / 4); cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(src, 16, src4, src5, src6, src7);
+    src += src_stride;
+    LD_UB4(src, 16, src8, src9, src10, src11);
+    src += src_stride;
+    LD_UB4(src, 16, src12, src13, src14, src15);
+    src += src_stride;
+
+    LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+    dst_dup += dst_stride;
+    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+    dst_dup += dst_stride;
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                dst8, dst9, dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                dst12, dst13, dst14, dst15);
+
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+    dst += dst_stride;
+    ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int32_t filter_x_stride,
+                          const int16_t *filter_y, int32_t filter_y_stride,
+                          int32_t w, int32_t h) {
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4: {
+      avg_width4_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 8: {
+      avg_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      avg_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      int32_t lp, cnt;
+      for (cnt = h; cnt--;) {
+        for (lp = 0; lp < w; ++lp) {
+          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+        }
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
new file mode 100644
index 0000000..ba40122
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "vpx_dsp/mips/macros_msa.h"
+
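+/* Plain block copy: widths 8/16/32/64 use MSA vector loads and stores,
+   width 4 copies one word per row, and any other width falls back to a
+   per-row memcpy. */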
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      out4 = __msa_copy_u_d((v2i64)src4, 0);
+      out5 = __msa_copy_u_d((v2i64)src5, 0);
+      out6 = __msa_copy_u_d((v2i64)src6, 0);
+      out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    for (cnt = height >> 3; cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+      out4 = __msa_copy_u_d((v2i64)src4, 0);
+      out5 = __msa_copy_u_d((v2i64)src5, 0);
+      out6 = __msa_copy_u_d((v2i64)src6, 0);
+      out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 4) {
+    for (cnt = (height / 4); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+      out2 = __msa_copy_u_d((v2i64)src2, 0);
+      out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 2) {
+    for (cnt = (height / 2); cnt--;) {
+      LD_UB2(src, src_stride, src0, src1);
+      src += (2 * src_stride);
+      out0 = __msa_copy_u_d((v2i64)src0, 0);
+      out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+      SD(out0, dst);
+      dst += dst_stride;
+      SD(out1, dst);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      LD_UB8(src_tmp, src_stride,
+             src0, src1, src2, src3, src4, src5, src6, src7);
+      src_tmp += (8 * src_stride);
+
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+             dst_tmp, dst_stride);
+      dst_tmp += (8 * dst_stride);
+    }
+
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+      src += (8 * src_stride);
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+      dst += (8 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+  } else if (0 == height % 4) {
+    for (cnt = (height >> 2); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      src += (4 * src_stride);
+
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+  if (0 == height % 12) {
+    for (cnt = (height / 12); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+  } else if (0 == height % 4) {
+    for (cnt = (height >> 2); cnt--;) {
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+      src += (4 * src_stride);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int32_t filter_x_stride,
+                           const int16_t *filter_y, int32_t filter_y_stride,
+                           int32_t w, int32_t h) {
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt, tmp;
+      /* 1 word storage */
+      for (cnt = h; cnt--;) {
+        tmp = LW(src);
+        SW(tmp, dst);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      uint32_t cnt;
+      for (cnt = h; cnt--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
new file mode 100644
index 0000000..e001398
--- /dev/null
+++ b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
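+/* Accumulates four signed byte-pair dot products (vec0..vec3 against
+   filt0..filt3) into one saturated halfword per lane, i.e. the 8-tap filter
+   sum when the input vectors hold interleaved pixel pairs. */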
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,         \
+                            filt0, filt1, filt2, filt3) ({  \
+  v8i16 tmp0, tmp1;                                         \
+                                                            \
+  tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);         \
+  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);  \
+  tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2);         \
+  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3);  \
+  tmp0 = __msa_adds_s_h(tmp0, tmp1);                        \
+                                                            \
+  tmp0;                                                     \
+})
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,        \
+                        filt_h0, filt_h1, filt_h2, filt_h3) ({         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+  v8i16 hz_out_m;                                                      \
+                                                                       \
+  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+             vec0_m, vec1_m, vec2_m, vec3_m);                          \
+  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                 filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                       \
+  hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                     \
+  hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                       \
+  hz_out_m;                                                            \
+})
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,               \
+                                   mask0, mask1, mask2, mask3,           \
+                                   filt0, filt1, filt2, filt3,           \
+                                   out0, out1) {                         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
+  v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
+                                                                         \
+  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
+  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
+  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
+  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
+  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
+  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
+  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1, out2, out3) {                \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                            \
+  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+              res0_m, res1_m, res2_m, res3_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+              res4_m, res5_m, res6_m, res7_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+               res0_m, res1_m, res2_m, res3_m);                             \
+  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+               res4_m, res5_m, res6_m, res7_m);                             \
+  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+              res7_m, out0, out1, out2, out3);                              \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
+  v16u8 tmp_m;                                          \
+                                                        \
+  tmp_m = PCKEV_XORI128_UB(in1, in0);                   \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
+  ST_UB(tmp_m, (pdst));                                 \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {           \
+  v16u8 tmp_m;                                           \
+                                                         \
+  tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);             \
+  ST_UB(tmp_m, (pdst));                                  \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride) {                              \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                  \
+                                                                        \
+  PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                      \
+  PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                  \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);          \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                             \
+}
+#endif  /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
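The macros above vectorize the 8-tap convolution with MSA dot-product/accumulate intrinsics. As a hedged point of reference, the following standalone scalar sketch shows the shape of the per-sample arithmetic (multiply-accumulate over eight taps, rounding by FILTER_BITS, clamping to a signed 8-bit intermediate), which is how I read the __msa_srari_h/__msa_sat_s_h pair in HORIZ_8TAP_FILT; it is not the MSA code path, and the kernel values are illustrative only, not taken from the library's filter tables:

    /* Scalar sketch (not the MSA path) of the per-sample arithmetic behind
     * FILT_8TAP_DPADD_S_H / HORIZ_8TAP_FILT above: an 8-tap multiply-accumulate,
     * rounded by FILTER_BITS (7) and clamped to a signed 8-bit intermediate.
     * The kernel below merely sums to 128 for illustration. */
    #include <stdint.h>
    #include <stdio.h>

    static int filt_8tap_scalar(const int16_t *src, const int16_t *filt) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k] * filt[k];
      sum = (sum + 64) >> 7;        /* round by FILTER_BITS */
      if (sum < -128) sum = -128;   /* clamp, as __msa_sat_s_h(x, 7) reads */
      if (sum > 127) sum = 127;
      return sum;
    }

    int main(void) {
      const int16_t filt[8] = { -1, 3, -10, 122, 18, -7, 3, 0 };
      const int16_t src[8] = { 4, 6, 9, 12, 14, 13, 11, 10 };
      printf("filtered sample = %d\n", filt_8tap_scalar(src, filt));
      return 0;
    }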
diff --git a/libvpx/vp9/common/vp9_prob.c b/libvpx/vpx_dsp/prob.c
similarity index 64%
rename from libvpx/vp9/common/vp9_prob.c
rename to libvpx/vpx_dsp/prob.c
index a1befc6..639d24d 100644
--- a/libvpx/vp9/common/vp9_prob.c
+++ b/libvpx/vpx_dsp/prob.c
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_prob.h"
+#include "./prob.h"
 
-const uint8_t vp9_norm[256] = {
+const uint8_t vpx_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -29,33 +29,25 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-
 static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const vp9_tree_index *tree,
-                                          const vp9_prob *pre_probs,
+                                          const vpx_tree_index *tree,
+                                          const vpx_prob *pre_probs,
                                           const unsigned int *counts,
-                                          unsigned int count_sat,
-                                          unsigned int max_update,
-                                          vp9_prob *probs) {
+                                          vpx_prob *probs) {
   const int l = tree[i];
   const unsigned int left_count = (l <= 0)
                  ? counts[-l]
-                 : tree_merge_probs_impl(l, tree, pre_probs, counts,
-                                         count_sat, max_update, probs);
+                 : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
   const int r = tree[i + 1];
   const unsigned int right_count = (r <= 0)
                  ? counts[-r]
-                 : tree_merge_probs_impl(r, tree, pre_probs, counts,
-                                         count_sat, max_update, probs);
+                 : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
   const unsigned int ct[2] = { left_count, right_count };
-  probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
-                              count_sat, max_update);
+  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
   return left_count + right_count;
 }
 
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, unsigned int count_sat,
-                          unsigned int max_update_factor, vp9_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
-                        max_update_factor, probs);
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs) {
+  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }
diff --git a/libvpx/vpx_dsp/prob.h b/libvpx/vpx_dsp/prob.h
new file mode 100644
index 0000000..729f90a
--- /dev/null
+++ b/libvpx/vpx_dsp/prob.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PROB_H_
+#define VPX_DSP_PROB_H_
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint8_t vpx_prob;
+
+#define MAX_PROB 255
+
+#define vpx_prob_half ((vpx_prob) 128)
+
+typedef int8_t vpx_tree_index;
+
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
+#define vpx_complement(x) (255 - x)
+
+#define MODE_MV_COUNT_SAT 20
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vpx_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vpx_tree_index vpx_tree[];
+
+static INLINE vpx_prob clip_prob(int p) {
+  return (p > 255) ? 255 : (p < 1) ? 1 : p;
+}
+
+static INLINE vpx_prob get_prob(int num, int den) {
+  return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
+}
+
+static INLINE vpx_prob get_binary_prob(int n0, int n1) {
+  return get_prob(n0, n0 + n1);
+}
+
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
+static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
+  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
+}
+
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
+                                   const unsigned int ct[2],
+                                   unsigned int count_sat,
+                                   unsigned int max_update_factor) {
+  const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
+  const unsigned int count = MIN(ct[0] + ct[1], count_sat);
+  const unsigned int factor = max_update_factor * count / count_sat;
+  return weighted_prob(pre_prob, prob, factor);
+}
+
+// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+  0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
+                                           const unsigned int ct[2]) {
+  const unsigned int den = ct[0] + ct[1];
+  if (den == 0) {
+    return pre_prob;
+  } else {
+    const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+    const unsigned int factor = count_to_update_factor[count];
+    const vpx_prob prob =
+        clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
+    return weighted_prob(pre_prob, prob, factor);
+  }
+}
+
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs);
+
+
+DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_PROB_H_
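To make the probability-update helpers above concrete, here is a hedged, self-contained sketch of the arithmetic mode_mv_merge_probs performs when a prior of 128 meets observed counts {30, 10}. MIN and ROUND_POWER_OF_TWO are re-declared locally only so the snippet compiles on its own (they normally come from vpx_dsp_common.h and vpx_ports/mem.h), and the inline factor computation mirrors the count_to_update_factor table noted above:

    /* Standalone sketch of the mode_mv_merge_probs update above. */
    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      const unsigned int ct[2] = { 30, 10 };        /* observed branch counts */
      const unsigned int den = ct[0] + ct[1];
      const unsigned int count = MIN(den, 20);      /* MODE_MV_COUNT_SAT */
      const unsigned int factor = 128 * count / 20; /* saturates at 128 */
      const int prob = (int)(((int64_t)ct[0] * 256 + (den >> 1)) / den); /* 192 */
      const int pre_prob = 128;
      const int merged =
          ROUND_POWER_OF_TWO(pre_prob * (256 - factor) + prob * factor, 8);
      printf("merged prob = %d\n", merged);         /* prior 128 pulled to 160 */
      return 0;
    }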
diff --git a/libvpx/vpx_dsp/psnrhvs.c b/libvpx/vpx_dsp/psnrhvs.c
new file mode 100644
index 0000000..2de77c0
--- /dev/null
+++ b/libvpx/vpx_dsp/psnrhvs.c
@@ -0,0 +1,227 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This code was originally written by: Gregory Maxwell, at the Daala
+ *  project.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+
+#if !defined(M_PI)
+# define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+                           int xstride) {
+  (void) xstride;
+  vpx_fdct8x8(x, y, ystride);
+}
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives a slightly higher MOS agreement. */
+static const float csf_y[8][8] = {
+    {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+     0.678296995242, 0.466224900598, 0.3265091542},
+    {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+     0.868920337363, 0.61280991668, 0.436405793551},
+    {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+     0.670882927016, 0.501731932449, 0.372504254596},
+    {1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575,
+     0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565},
+    {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554,
+     0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204},
+    {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+     0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321},
+    {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+     0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001},
+    {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+     0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}};
+static const float csf_cb420[8][8] = {
+    {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+     0.898018824055, 0.74725392039, 0.615105596242},
+    {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+     1.17428548929, 0.996404342439, 0.830890433625},
+    {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+     0.960060382087, 0.849823426169, 0.731221236837},
+    {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629,
+     0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374},
+    {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099,
+     0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034},
+    {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+     0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965},
+    {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+     0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733},
+    {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+     0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}};
+static const float csf_cr420[8][8] = {
+    {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+     0.867069376285, 0.721500455585, 0.593906509971},
+    {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+     1.13381474809, 0.962064122248, 0.802254508198},
+    {1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+     0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706},
+    {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+     0.725539939514, 0.661776842059, 0.587716619023},
+    {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195,
+     0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273},
+    {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+     0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543},
+    {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+     0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063},
+    {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+     0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}};
+
+static double convert_score_db(double _score, double _weight) {
+  return 10 * (log10(255 * 255) - log10(_weight * _score));
+}
+
+static double calc_psnrhvs(const unsigned char *_src, int _systride,
+                           const unsigned char *_dst, int _dystride,
+                           double _par, int _w, int _h, int _step,
+                           const float _csf[8][8]) {
+  float ret;
+  int16_t dct_s[8 * 8], dct_d[8 * 8];
+  tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+  float mask[8][8];
+  int pixels;
+  int x;
+  int y;
+  (void) _par;
+  ret = pixels = 0;
+  /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+   their masking table as "we have used the quantization table for the
+   color component Y of JPEG [6] that has been also obtained on the
+   basis of CSF. Note that the values in quantization table JPEG have
+   been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+   was also constructed from the JPEG matrices. I can not find any obvious
+   scheme of normalizing to produce their table, but if I multiply their
+   CSF by 0.38857 and square the result I get their masking table.
+   I have no idea where this constant comes from, but deviating from it
+   too greatly hurts MOS agreement.
+
+   [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+   Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+   of DCT basis functions", CD-ROM Proceedings of the Third
+   International Workshop on Video Processing and Quality Metrics for Consumer
+   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+  for (x = 0; x < 8; x++)
+    for (y = 0; y < 8; y++)
+      mask[x][y] = (_csf[x][y] * 0.3885746225901003)
+          * (_csf[x][y] * 0.3885746225901003);
+  for (y = 0; y < _h - 7; y += _step) {
+    for (x = 0; x < _w - 7; x += _step) {
+      int i;
+      int j;
+      float s_means[4];
+      float d_means[4];
+      float s_vars[4];
+      float d_vars[4];
+      float s_gmean = 0;
+      float d_gmean = 0;
+      float s_gvar = 0;
+      float d_gvar = 0;
+      float s_mask = 0;
+      float d_mask = 0;
+      for (i = 0; i < 4; i++)
+        s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+          dct_s[i * 8 + j] = _src[(y + i) * _systride + (j + x)];
+          dct_d[i * 8 + j] = _dst[(y + i) * _dystride + (j + x)];
+          s_gmean += dct_s[i * 8 + j];
+          d_gmean += dct_d[i * 8 + j];
+          s_means[sub] += dct_s[i * 8 + j];
+          d_means[sub] += dct_d[i * 8 + j];
+        }
+      }
+      s_gmean /= 64.f;
+      d_gmean /= 64.f;
+      for (i = 0; i < 4; i++)
+        s_means[i] /= 16.f;
+      for (i = 0; i < 4; i++)
+        d_means[i] /= 16.f;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+          s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+          d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+          s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub])
+              * (dct_s[i * 8 + j] - s_means[sub]);
+          d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub])
+              * (dct_d[i * 8 + j] - d_means[sub]);
+        }
+      }
+      s_gvar *= 1 / 63.f * 64;
+      d_gvar *= 1 / 63.f * 64;
+      for (i = 0; i < 4; i++)
+        s_vars[i] *= 1 / 15.f * 16;
+      for (i = 0; i < 4; i++)
+        d_vars[i] *= 1 / 15.f * 16;
+      if (s_gvar > 0)
+        s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+      if (d_gvar > 0)
+        d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      for (i = 0; i < 8; i++)
+        for (j = (i == 0); j < 8; j++)
+          s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+      for (i = 0; i < 8; i++)
+        for (j = (i == 0); j < 8; j++)
+          d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+      s_mask = sqrt(s_mask * s_gvar) / 32.f;
+      d_mask = sqrt(d_mask * d_gvar) / 32.f;
+      if (d_mask > s_mask)
+        s_mask = d_mask;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          float err;
+          err = fabs(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]);
+          if (i != 0 || j != 0)
+            err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+          ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+          pixels++;
+        }
+      }
+    }
+  }
+  ret /= pixels;
+  return ret;
+}
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs,
+                   double *u_psnrhvs, double *v_psnrhvs) {
+  double psnrhvs;
+  const double par = 1.0;
+  const int step = 7;
+  vpx_clear_system_state();
+  *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
+                            dest->y_stride, par, source->y_crop_width,
+                            source->y_crop_height, step, csf_y);
+
+  *u_psnrhvs = calc_psnrhvs(source->u_buffer, source->uv_stride, dest->u_buffer,
+                            dest->uv_stride, par, source->uv_crop_width,
+                            source->uv_crop_height, step, csf_cb420);
+
+  *v_psnrhvs = calc_psnrhvs(source->v_buffer, source->uv_stride, dest->v_buffer,
+                            dest->uv_stride, par, source->uv_crop_width,
+                            source->uv_crop_height, step, csf_cr420);
+  psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+
+  return convert_score_db(psnrhvs, 1.0);
+}
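As a small sanity check of the final combination above, the following standalone sketch reproduces the 0.8/0.1/0.1 plane weighting and the conversion to dB relative to a peak of 255^2; the plane scores used here are arbitrary placeholders, not values produced by calc_psnrhvs:

    /* Sketch of the score combination and dB conversion in vpx_psnrhvs above. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double y = 20.0, u = 12.0, v = 14.0;   /* hypothetical plane scores */
      const double combined = y * .8 + .1 * (u + v);
      const double db = 10 * (log10(255 * 255) - log10(combined));
      printf("PSNR-HVS = %.2f dB\n", db);
      return 0;
    }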
diff --git a/libvpx/vpx_dsp/quantize.c b/libvpx/vpx_dsp/quantize.c
new file mode 100644
index 0000000..e4e741a
--- /dev/null
+++ b/libvpx/vpx_dsp/quantize.c
@@ -0,0 +1,337 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 16;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 15;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        if (abs_qcoeff)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
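The core of every variant above is the same per-coefficient pipeline: add the rounding term, apply quant and quant_shift, restore the sign, then dequantize. The following hedged sketch walks one hypothetical coefficient through the vpx_quantize_b_c inner step; the round/quant/quant_shift/dequant values are made up for illustration and would really come from the encoder's quantizer setup:

    /* Walk one hypothetical coefficient through the vpx_quantize_b_c inner
     * step above (the clamp of tmp to int16 range is omitted here). */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int coeff = -173;   /* hypothetical transform coefficient */
      const int round = 12, quant = 20000, quant_shift = 32768, dequant = 24;
      const int coeff_sign = coeff >> 31;
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      int tmp = abs_coeff + round;
      tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
      {
        const int qcoeff = (tmp ^ coeff_sign) - coeff_sign;
        const int dqcoeff = qcoeff * dequant;
        printf("qcoeff=%d dqcoeff=%d\n", qcoeff, dqcoeff);
      }
      return 0;
    }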
diff --git a/libvpx/vpx_dsp/quantize.h b/libvpx/vpx_dsp/quantize.h
new file mode 100644
index 0000000..89ec597
--- /dev/null
+++ b/libvpx/vpx_dsp/quantize.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_QUANTIZE_H_
+#define VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_QUANTIZE_H_
diff --git a/libvpx/vpx_dsp/sad.c b/libvpx/vpx_dsp/sad.c
new file mode 100644
index 0000000..c0c3ff9
--- /dev/null
+++ b/libvpx/vpx_dsp/sad.c
@@ -0,0 +1,318 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+/* Sum the difference between every corresponding element of the buffers. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride,
+                               const uint8_t *b, int b_stride,
+                               int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
+/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
+ * The function averages every corresponding element of the buffers and stores
+ * the value in a third buffer, comp_pred.
+ * pred and comp_pred are assumed to have stride = width
+ * In the usage below comp_pred is a local array.
+ */
+static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define sadMxN(m, n) \
+unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride) { \
+  return sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                      const uint8_t *ref, int ref_stride, \
+                                      const uint8_t *second_pred) { \
+  uint8_t comp_pred[m * n]; \
+  avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return sad(src, src_stride, comp_pred, m, m, n); \
+}
+
+// Depending on call sites, pass **ref_array to avoid taking the address (&)
+// in the subsequent call, and de-dup with the 4D version below.
+#define sadMxNxK(m, n, k) \
+void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                const uint8_t *ref_array, int ref_stride, \
+                                uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < k; ++i) \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+}
+
+// This appears to be equivalent to the above when k == 4 and refs is const
+#define sadMxNx4D(m, n) \
+void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                             const uint8_t *const ref_array[], int ref_stride, \
+                             uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < 4; ++i) \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+}
+
+// 64x64
+sadMxN(64, 64)
+sadMxNxK(64, 64, 3)
+sadMxNxK(64, 64, 8)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNxK(32, 32, 3)
+sadMxNxK(32, 32, 8)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNxK(16, 16, 3)
+sadMxNxK(16, 16, 8)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNxK(16, 8, 3)
+sadMxNxK(16, 8, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNxK(8, 16, 3)
+sadMxNxK(8, 16, 8)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNxK(8, 8, 3)
+sadMxNxK(8, 8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNxK(8, 4, 8)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNxK(4, 8, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNxK(4, 4, 3)
+sadMxNxK(4, 4, 8)
+sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+                                       const uint16_t *b, int b_stride,
+                                       int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+#define highbd_sadMxN(m, n) \
+unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride) { \
+  return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+                                             int src_stride, \
+                                             const uint8_t *ref, \
+                                             int ref_stride, \
+                                             const uint8_t *second_pred) { \
+  uint16_t comp_pred[m * n]; \
+  highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
+#define highbd_sadMxNxK(m, n, k) \
+void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref_array, int ref_stride, \
+                                       uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < k; ++i) { \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+                                               ref_stride); \
+  } \
+}
+
+#define highbd_sadMxNx4D(m, n) \
+void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *const ref_array[], \
+                                    int ref_stride, uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < 4; ++i) { \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+                                               ref_stride); \
+  } \
+}
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxNxK(64, 64, 3)
+highbd_sadMxNxK(64, 64, 8)
+highbd_sadMxNx4D(64, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxNxK(32, 32, 3)
+highbd_sadMxNxK(32, 32, 8)
+highbd_sadMxNx4D(32, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxNxK(16, 16, 3)
+highbd_sadMxNxK(16, 16, 8)
+highbd_sadMxNx4D(16, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxNxK(16, 8, 3)
+highbd_sadMxNxK(16, 8, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxNxK(8, 16, 3)
+highbd_sadMxNxK(8, 16, 8)
+highbd_sadMxNx4D(8, 16)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxNxK(8, 8, 3)
+highbd_sadMxNxK(8, 8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxNxK(8, 4, 8)
+highbd_sadMxNx4D(8, 4)
+
+// 4x8
+highbd_sadMxN(4, 8)
+highbd_sadMxNxK(4, 8, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 4x4
+highbd_sadMxN(4, 4)
+highbd_sadMxNxK(4, 4, 3)
+highbd_sadMxNxK(4, 4, 8)
+highbd_sadMxNx4D(4, 4)
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
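The sadMxN/sadMxNxK/sadMxNx4D macros above stamp out the per-block-size entry points (vpx_sad4x4_c, vpx_sad64x64_c, and so on). As a hedged illustration, this standalone sketch is what sadMxN(4, 4) effectively expands to, applied to two made-up 4x4 blocks:

    /* Standalone sketch of the function generated by sadMxN(4, 4) above. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    static unsigned int sad4x4(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
      unsigned int sum = 0;
      int y, x;
      for (y = 0; y < 4; ++y) {
        for (x = 0; x < 4; ++x)
          sum += abs(src[x] - ref[x]);
        src += src_stride;
        ref += ref_stride;
      }
      return sum;
    }

    int main(void) {
      const uint8_t src[16] = { 10, 10, 10, 10, 20, 20, 20, 20,
                                30, 30, 30, 30, 40, 40, 40, 40 };
      const uint8_t ref[16] = { 12, 10,  9, 10, 20, 22, 20, 18,
                                30, 30, 33, 30, 40, 41, 40, 40 };
      printf("SAD = %u\n", sad4x4(src, 4, ref, 4));   /* prints SAD = 11 */
      return 0;
    }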
diff --git a/libvpx/vpx_dsp/ssim.c b/libvpx/vpx_dsp/ssim.c
new file mode 100644
index 0000000..cfe5bb3
--- /dev/null
+++ b/libvpx/vpx_dsp/ssim.c
@@ -0,0 +1,505 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r,
+                            int rp, uint32_t *sum_s, uint32_t *sum_r,
+                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                            uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 16; i++, s += sp, r += rp) {
+    for (j = 0; j < 16; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+                          uint32_t *sum_s, uint32_t *sum_r,
+                          uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                          uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp,
+                                 const uint16_t *r, int rp,
+                                 uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static const int64_t cc1 =  26634;  // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
+
+static double similarity(uint32_t sum_s, uint32_t sum_r,
+                         uint32_t sum_sq_s, uint32_t sum_sq_r,
+                         uint32_t sum_sxr, int count) {
+  int64_t ssim_n, ssim_d;
+  int64_t c1, c2;
+
+  // scale the constants by number of pixels
+  c1 = (cc1 * count * count) >> 12;
+  c2 = (cc2 * count * count) >> 12;
+
+  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
+                                       (int64_t) 2 * sum_s * sum_r + c2);
+
+  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
+
+  return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                     &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, unsigned int bd) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  const int oshift = bd - 8;
+  vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> oshift,
+                    sum_r >> oshift,
+                    sum_sq_s >> (2 * oshift),
+                    sum_sq_r >> (2 * oshift),
+                    sum_sxr >> (2 * oshift),
+                    64);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// We are using an 8x8 moving window with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+static double vpx_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width,
+                        int height) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample point start with each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                               int stride_img1, int stride_img2, int width,
+                               int height, unsigned int bd) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample point start with each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
+                                 bd);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest,
+                     double *weight) {
+  double a, b, c;
+  double ssimv;
+
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride,
+                source->y_crop_width, source->y_crop_height);
+
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                      const YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride,
+                source->y_crop_width, source->y_crop_height);
+
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Re working out the math ->
+//
+// ssim(x,y) =  (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+//   ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+      (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+      / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r
+         - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
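A hedged, self-contained sketch of the evaluation above: fill the sums for an identical, constant 8x8 source/reference pair and recompute the luminance term l and contrast/structure term v exactly as ssimv_similarity does; with identical inputs both terms come out to 1.0:

    /* Standalone sketch of ssimv_similarity() above for a constant 8x8 block
     * (every pixel 100, n = 64) compared against itself. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int64_t n = 64;
      const int64_t sum_s = 100 * 64, sum_r = 100 * 64;
      const int64_t sum_sq_s = 100 * 100 * 64, sum_sq_r = 100 * 100 * 64;
      const int64_t sum_sxr = 100 * 100 * 64;
      const int64_t c1 = (26634 * n * n) >> 12;   /* cc1 scaled as above */
      const int64_t c2 = (239708 * n * n) >> 12;  /* cc2 scaled as above */
      const double l = 1.0 * (2 * sum_s * sum_r + c1) /
          (sum_s * sum_s + sum_r * sum_r + c1);
      const double v = (2.0 * n * sum_sxr - 2 * sum_s * sum_r + c2) /
          (n * sum_sq_s - sum_s * sum_s + n * sum_sq_r - sum_r * sum_r + c2);
      printf("l = %f, v = %f, ssim = %f\n", l, v, l * v);  /* all 1.0 */
      return 0;
    }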
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side. Consider the two sets
+// (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 for the first, but
+// 2*250*252/(250^2+252^2) => .99999997 for the second.
+//
+// As a result, this tweaked version of the calculation takes the luminance
+// as a percentage off from the peak possible:
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+      / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+         n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                        int img2_pitch, Ssimv *sv) {
+  vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
+                     &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
+                     &sv->sum_sxr);
+}
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+                            uint8_t *img2, int img2_pitch,
+                            int width, int height,
+                            Ssimv *sv2, Metrics *m,
+                            int do_inconsistency) {
+  double dssim_total = 0;
+  double ssim_total = 0;
+  double ssim2_total = 0;
+  double inconsistency_total = 0;
+  int i, j;
+  int c = 0;
+  double norm;
+  double old_ssim_total = 0;
+  vpx_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+  for (i = 0; i < height; i += 4,
+       img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4, ++c) {
+      Ssimv sv = {0};
+      double ssim;
+      double ssim2;
+      double dssim;
+      uint32_t var_new;
+      uint32_t var_old;
+      uint32_t mean_new;
+      uint32_t mean_old;
+      double ssim_new;
+      double ssim_old;
+
+      // There is no obviously great way to handle the edge pixels
+      // in ssim when using a window; any choice seems biased against
+      // edge pixels, however you handle this. This uses only samples
+      // that are fully in the frame.
+      if (j + 8 <= width && i + 8 <= height) {
+        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+      }
+
+      ssim = ssimv_similarity(&sv, 64);
+      ssim2 = ssimv_similarity2(&sv, 64);
+
+      sv.ssim = ssim2;
+
+      // dssim is calculated for use as an actual error metric and
+      // is scaled up to the same range as sum-square error.
+      // Since we are subsampling every 16th point, maybe this should be
+      // *16?
+      dssim = 255 * 255 * (1 - ssim2) / 2;
+
+      // Here I introduce a new error metric: consistency-weighted
+      // SSIM-inconsistency.  This metric isolates frames where the
+      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+      // sharper or blurrier than the others. Higher values indicate a
+      // temporally inconsistent SSIM. There are two ideas at work:
+      //
+      // 1) 'SSIM-inconsistency': the total inconsistency value
+      // reflects how much SSIM values are changing between this
+      // source / reference frame pair and the previous pair.
+      //
+      // 2) 'consistency-weighted': weights de-emphasize areas in the
+      // frame where the scene content has changed. Changes in scene
+      // content are detected via changes in local variance and local
+      // mean.
+      //
+      // Thus the overall measure reflects how inconsistent the SSIM
+      // values are, over consistent regions of the frame.
+      //
+      // The metric has three terms:
+      //
+      // term 1 -> uses change in scene Variance to weight error score
+      //  2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+      //  larger changes from one frame to the next mean we care
+      //  less about consistency.
+      //
+      // term 2 -> uses change in local scene luminance to weight error
+      //  2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+      //  larger changes from one frame to the next mean we care
+      //  less about consistency.
+      //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //   1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+      //
+      // This term compares the ssim score for the same location in 2
+      // subsequent frames.
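+      //
+      // Worked example (added; not part of the original comment): with
+      // c3 = 1, ssim_old = 0.98 and ssim_new = 0.90,
+      //   ssim_term = ((2*.98*.90 + 1) / (.98^2 + .90^2 + 1))^5 ~= 0.988
+      // so this_inconsistency ~= (1 - 0.988) * 4*4*255*255 ~= 12000 when the
+      // variance and mean terms are close to 1 (unchanged scene content).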
+      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+      mean_new = sv.sum_s;
+      mean_old = sv2[c].sum_s;
+      ssim_new = sv.ssim;
+      ssim_old = sv2[c].ssim;
+
+      if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation,
+        // 1.0 maps to 4x4x255x255, the worst error we can possibly have.
+        static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-zero
+        // issues; beyond that they only act as a rough weighting between
+        // the terms.  No testing of what the right values should be has been
+        // done.
+        static const double c1 = 1, c2 = 1, c3 = 1;
+
+        // This measures how consistent the variance is between two
+        // consecutive source frames. 1.0 means they have exactly the same
+        // variance.
+        const double variance_term = (2.0 * var_old * var_new + c1) /
+            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+        const double mean_term = (2.0 * mean_old * mean_new + c2) /
+            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+        double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) /
+                               (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+                               5);
+
+        double this_inconsistency;
+
+        // Floating point math sometimes makes this > 1 by a tiny bit.
+        // We want the metric to scale between 0 and 1.0 so we can convert
+        // it to an snr scaled value.
+        if (ssim_term > 1)
+          ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less, as it will be less visible
+        // to the user.
+        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+        this_inconsistency *= kScaling;
+        inconsistency_total += this_inconsistency;
+      }
+      sv2[c] = sv;
+      ssim_total += ssim;
+      ssim2_total += ssim2;
+      dssim_total += dssim;
+
+      old_ssim_total += ssim_old;
+    }
+    old_ssim_total += 0;
+  }
+
+  norm = 1. / (width / 4) / (height / 4);
+  ssim_total *= norm;
+  ssim2_total *= norm;
+  m->ssim2 = ssim2_total;
+  m->ssim = ssim_total;
+  if (old_ssim_total == 0)
+    inconsistency_total = 0;
+
+  m->ssimc = inconsistency_total;
+
+  m->dssim = dssim_total;
+  return inconsistency_total;
+}
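+
+// Illustrative usage (added note, not part of the library): the caller keeps
+// one Ssimv per 4x4 sample point across calls so consecutive frames can be
+// compared.  A minimal sketch, assuming w and h are multiples of 4:
+//
+//   Ssimv *sv2 = (Ssimv *)calloc((w / 4) * (h / 4), sizeof(*sv2));
+//   Metrics m;
+//   // Use do_inconsistency = 0 for the first frame, non-zero afterwards.
+//   vpx_get_ssim_metrics(src, src_stride, ref, ref_stride, w, h, sv2, &m,
+//                        frame_index > 0);
+//   free(sv2);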
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest,
+                            double *weight, unsigned int bd) {
+  double a, b, c;
+  double ssimv;
+
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                             const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v, unsigned int bd) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/ssim.h b/libvpx/vpx_dsp/ssim.h
new file mode 100644
index 0000000..132f7f9
--- /dev/null
+++ b/libvpx/vpx_dsp/ssim.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_SSIM_H_
+#define VPX_DSP_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint32_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint32_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint32_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+  uint32_t sum_sxr;
+
+  // calculated ssim score between source and reference
+  double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+  double ssimc;
+
+  // standard ssim
+  double ssim;
+
+  // revised ssim (see code for explanation)
+  double ssim2;
+
+  // ssim restated as an error metric like sse
+  double dssim;
+
+  // dssim converted to decibels
+  double dssimd;
+
+  // ssimc converted to decibels
+  double ssimcd;
+} Metrics;
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                            int img2_pitch, int width, int height, Ssimv *sv2,
+                            Metrics *m, int do_inconsistency);
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest,
+                     double *weight);
+
+double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                      const YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest,
+                         double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest,
+                   double *ssim_y, double *ssim_u, double *ssim_v);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest,
+                            double *weight,
+                            unsigned int bd);
+
+double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                             const YV12_BUFFER_CONFIG *dest,
+                             double *ssim_y,
+                             double *ssim_u,
+                             double *ssim_v,
+                             unsigned int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_SSIM_H_
diff --git a/libvpx/vpx_dsp/subtract.c b/libvpx/vpx_dsp/subtract.c
new file mode 100644
index 0000000..556e013
--- /dev/null
+++ b/libvpx/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
+      diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/txfm_common.h b/libvpx/vpx_dsp/txfm_common.h
new file mode 100644
index 0000000..442e6a5
--- /dev/null
+++ b/libvpx/vpx_dsp/txfm_common.h
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_TXFM_COMMON_H_
+#define VPX_DSP_TXFM_COMMON_H_
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// Constants:
+//  for (int i = 1; i < 32; ++i)
+//    printf("static const int cospi_%d_64 = %.0f;\n", i,
+//           round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
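+// Note (added): 16384 is (1 << DCT_CONST_BITS); transform code typically
+// brings products by these constants back down with DCT_CONST_ROUNDING and a
+// right shift by DCT_CONST_BITS.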
+static const tran_high_t cospi_1_64  = 16364;
+static const tran_high_t cospi_2_64  = 16305;
+static const tran_high_t cospi_3_64  = 16207;
+static const tran_high_t cospi_4_64  = 16069;
+static const tran_high_t cospi_5_64  = 15893;
+static const tran_high_t cospi_6_64  = 15679;
+static const tran_high_t cospi_7_64  = 15426;
+static const tran_high_t cospi_8_64  = 15137;
+static const tran_high_t cospi_9_64  = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+//  16384 * sqrt(2) * sin(k*Pi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+#endif  // VPX_DSP_TXFM_COMMON_H_
diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c
new file mode 100644
index 0000000..e8bddb0
--- /dev/null
+++ b/libvpx/vpx_dsp/variance.c
@@ -0,0 +1,621 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+  { 128,   0  },
+  { 112,  16  },
+  {  96,  32  },
+  {  80,  48  },
+  {  64,  64  },
+  {  48,  80  },
+  {  32,  96  },
+  {  16, 112  },
+};
+
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int  a_stride,
+                            const uint8_t *b, int  b_stride) {
+  int distortion = 0;
+  int r, c;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c) {
+      int diff = a[c] - b[c];
+      distortion += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  return distortion;
+}
+
+uint32_t vpx_get_mb_ss_c(const int16_t *a) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; ++i) {
+    sum += a[i] * a[i];
+  }
+
+  return sum;
+}
+
+uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
+                                       b, b_stride, sse);
+}
+
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
+                                       b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+                                           const uint8_t *b, int b_stride,
+                                           uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
+                                       b, b_stride, sse);
+}
+
+static void variance(const uint8_t *a, int  a_stride,
+                     const uint8_t *b, int  b_stride,
+                     int  w, int  h, uint32_t *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the first pass of a 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
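+//
+// For example (illustrative), with filter = {112, 16} (xoffset of 1, i.e. a
+// 1/8-pel offset) and FILTER_BITS = 7, each output sample is
+//   (112 * a[0] + 16 * a[pixel_step] + 64) >> 7.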
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+                          (int)a[pixel_step] * filter[1],
+                          FILTER_BITS);
+
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const uint8_t *filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+                          (int)a[pixel_step] * filter[1],
+                          FILTER_BITS);
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+#define VAR(W, H) \
+uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                   const uint8_t *b, int b_stride, \
+                                   uint32_t *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define SUBPIX_VAR(W, H) \
+uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                             int xoffset, int  yoffset, \
+                                             const uint8_t *b, int b_stride, \
+                                             uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint8_t temp2[H * W]; \
+\
+  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+                                    bilinear_filters[xoffset]); \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                     bilinear_filters[yoffset]); \
+\
+  return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+}
+
+#define SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int  a_stride, \
+                                                 int xoffset, int  yoffset, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 uint32_t *sse, \
+                                                 const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint8_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+\
+  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+                                    bilinear_filters[xoffset]); \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                     bilinear_filters[yoffset]); \
+\
+  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+\
+  return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                             const uint8_t *b, int b_stride, \
+                             uint32_t *sse, int *sum) { \
+  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+}
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
+ * variable.
+ */
+#define MSE(W, H) \
+uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+                              const uint8_t *b, int b_stride, \
+                              uint32_t *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+    VAR(W, H) \
+    SUBPIX_VAR(W, H) \
+    SUBPIX_AVG_VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                         int width, int height,
+                         const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              int w, int h, uint64_t *sse, uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void highbd_8_variance(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              int w, int h, uint32_t *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (uint32_t)sse_long;
+  *sum = (int)sum_long;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, uint32_t *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, uint32_t *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
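+
+// Note (added): 10-bit and 12-bit pixels carry 4x and 16x the 8-bit range, so
+// the sums above are normalized by >> 2 and >> 4 and the sums of squares by
+// >> 4 and >> 8 respectively, keeping the returned variance on the 8-bit
+// scale.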
+
+#define HIGHBD_VAR(W, H) \
+uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+                                            int a_stride, \
+                                            const uint8_t *b, \
+                                            int b_stride, \
+                                            uint32_t *sse) { \
+  int sum; \
+  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
+  int sum; \
+  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
+  int sum; \
+  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGHBD_GET_VAR(S) \
+void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                      const uint8_t *ref, int ref_stride, \
+                                      uint32_t *sse, int *sum) { \
+  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGHBD_MSE(W, H) \
+uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+                                       int src_stride, \
+                                       const uint8_t *ref, \
+                                       int ref_stride, \
+                                       uint32_t *sse) { \
+  int sum; \
+  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
+  int sum; \
+  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
+  int sum; \
+  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+static void highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int i, j;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+                             (int)src_ptr[pixel_step] * filter[1],
+                             FILTER_BITS);
+
+      ++src_ptr;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+                             (int)src_ptr[pixel_step] * filter[1],
+                             FILTER_BITS);
+      ++src_ptr;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+    HIGHBD_VAR(W, H) \
+    HIGHBD_SUBPIX_VAR(W, H) \
+    HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                              int width, int height, const uint8_t *ref8,
+                              int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/variance.h b/libvpx/vpx_dsp/variance.h
new file mode 100644
index 0000000..cd0fd98
--- /dev/null
+++ b/libvpx/vpx_dsp/variance.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VARIANCE_H_
+#define VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+                                    const uint8_t *b_ptr, int b_stride);
+
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+                                        const uint8_t *b_ptr, int b_stride,
+                                        const uint8_t *second_pred);
+
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
+                                  uint8_t *b, int b_stride, int n);
+
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   unsigned int *sad_array);
+
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+                                     const uint8_t *const b_array[],
+                                     int b_stride,
+                                     unsigned int *sad_array);
+
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          unsigned int *sse);
+
+typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+                                                int xoffset, int yoffset,
+                                                const uint8_t *b, int b_stride,
+                                                unsigned int *sse);
+
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+                                                   int a_stride,
+                                                   int xoffset, int yoffset,
+                                                   const uint8_t *b_ptr,
+                                                   int b_stride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+#if CONFIG_VP8
+typedef struct variance_vtable {
+  vpx_sad_fn_t            sdf;
+  vpx_variance_fn_t       vf;
+  vpx_subpixvariance_fn_t svf;
+  vpx_variance_fn_t       svf_halfpix_h;
+  vpx_variance_fn_t       svf_halfpix_v;
+  vpx_variance_fn_t       svf_halfpix_hv;
+  vpx_sad_multi_fn_t      sdx3f;
+  vpx_sad_multi_fn_t      sdx8f;
+  vpx_sad_multi_d_fn_t    sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+  vp8_copy32xn_fn_t       copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+#endif  // CONFIG_VP8
+
+#if CONFIG_VP9 || CONFIG_VP10
+typedef struct vp9_variance_vtable {
+  vpx_sad_fn_t               sdf;
+  vpx_sad_avg_fn_t           sdaf;
+  vpx_variance_fn_t          vf;
+  vpx_subpixvariance_fn_t    svf;
+  vpx_subp_avg_variance_fn_t svaf;
+  vpx_sad_multi_fn_t         sdx3f;
+  vpx_sad_multi_fn_t         sdx8f;
+  vpx_sad_multi_d_fn_t       sdx4df;
+} vp9_variance_fn_ptr_t;
+#endif  // CONFIG_VP9 || CONFIG_VP10
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VARIANCE_H_
diff --git a/libvpx/vpx_dsp/vpx_convolve.c b/libvpx/vpx_dsp/vpx_convolve.c
new file mode 100644
index 0000000..2d1c927
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_convolve.c
@@ -0,0 +1,612 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
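+// Note (added for clarity): positions below are tracked in 1/16th-pel units
+// (SUBPEL_BITS = 4).  The integer part (>> SUBPEL_BITS) selects the source
+// sample and the fractional part (& SUBPEL_MASK) selects one of the 16 8-tap
+// kernels; a step of 16 per output pixel therefore means no scaling.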
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters,
+                           int x0_q4, int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *x_filters,
+                               int x0_q4, int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *y_filters,
+                              int y0_q4, int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const InterpKernel *const x_filters,
+                     int x0_q4, int x_step_q4,
+                     const InterpKernel *const y_filters,
+                     int y0_q4, int y_step_q4,
+                     int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
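+  // For the common unscaled case (added note): y_step_q4 = 16 needs only
+  //   (((64 - 1) * 16 + 15) >> 4) + 8 = 71
+  // rows; 135 covers the worst case at the maximum x1/2 downscale.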
+  uint8_t temp[135 * 64];
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  return (int)((const InterpKernel *)(intptr_t)f - base);
+}
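+
+// (Added note) The mask above works because a full set of SUBPEL_SHIFTS (16)
+// kernels of SUBPEL_TAPS (8) int16_t coefficients occupies exactly
+// 16 * 8 * 2 = 256 bytes; clearing the low 8 address bits recovers the start
+// of the table and the pointer difference is the kernel index.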
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  (void)filter_y;
+  (void)y_step_q4;
+
+  convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+                 x0_q4, x_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  (void)filter_y;
+  (void)y_step_q4;
+
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+                     x0_q4, x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4,
+                          int w, int h) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  (void)filter_x;
+  (void)x_step_q4;
+
+  convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+                y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  (void)filter_x;
+  (void)x_step_q4;
+
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+                    y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  convolve(src, src_stride, dst, dst_stride,
+           filters_x, x0_q4, x_step_q4,
+           filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  /* Fixed size intermediate buffer places limits on parameters. */
+  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+
+  vpx_convolve8_c(src, src_stride, temp, 64,
+                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int filter_x_stride,
+                         const int16_t *filter_y, int filter_y_stride,
+                         int w, int h) {
+  int r;
+
+  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_y;  (void)filter_y_stride;
+
+  for (r = h; r > 0; --r) {
+    memcpy(dst, src, w);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int filter_x_stride,
+                        const int16_t *filter_y, int filter_y_stride,
+                        int w, int h) {
+  int x, y;
+
+  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_y;  (void)filter_y_stride;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x)
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                        filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
+                       const int16_t *filter_x, int x_step_q4,
+                       const int16_t *filter_y, int y_step_q4,
+                       int w, int h) {
+  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                       filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                  filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                      filter_y, y_step_q4, w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const InterpKernel *x_filters,
+                                  int x0_q4, int x_step_q4,
+                                  int w, int h, int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                      uint8_t *dst8, ptrdiff_t dst_stride,
+                                      const InterpKernel *x_filters,
+                                      int x0_q4, int x_step_q4,
+                                      int w, int h, int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
+                                 const InterpKernel *y_filters,
+                                 int y0_q4, int y_step_q4, int w, int h,
+                                 int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = clip_pixel_highbd(
+          ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const InterpKernel *y_filters,
+                                     int y0_q4, int y_step_q4, int w, int h,
+                                     int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *const x_filters,
+                            int x0_q4, int x_step_q4,
+                            const InterpKernel *const y_filters,
+                            int y0_q4, int y_step_q4,
+                            int w, int h, int bd) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
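+  // Working through those numbers: (64 - 1) * 32 = 2016 sixteenth-pel units,
+  // plus 15 for a worst-case sub-pixel start gives 2031; rounding up to whole
+  // source rows yields 127, and adding the 8 filter taps gives 135 rows.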
+  uint16_t temp[64 * 135];
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+                        x_filters, x0_q4, x_step_q4, w,
+                        intermediate_height, bd);
+  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
+                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
+                       w, h, bd);
+}
+
+void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+  (void)filter_y;
+  (void)y_step_q4;
+
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+                        x0_q4, x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+  (void)filter_y;
+  (void)y_step_q4;
+
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+                            x0_q4, x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int bd) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+  (void)filter_x;
+  (void)x_step_q4;
+
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+                       y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+  (void)filter_x;
+  (void)x_step_q4;
+
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+                           y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  highbd_convolve(src, src_stride, dst, dst_stride,
+                  filters_x, x0_q4, x_step_q4,
+                  filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4,
+                                int w, int h, int bd) {
+  // Fixed size intermediate buffer places limits on parameters.
+  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+
+  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+                            NULL, 0, NULL, 0, w, h, bd);
+}
+
+void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                uint8_t *dst8, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int filter_x_stride,
+                                const int16_t *filter_y, int filter_y_stride,
+                                int w, int h, int bd) {
+  int r;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  for (r = h; r > 0; --r) {
+    memcpy(dst, src, w * sizeof(uint16_t));
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+                               uint8_t *dst8, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int filter_x_stride,
+                               const int16_t *filter_y, int filter_y_stride,
+                               int w, int h, int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif
diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vpx_dsp/vpx_convolve.h
similarity index 63%
rename from libvpx/vp9/common/vp9_convolve.h
rename to libvpx/vpx_dsp/vpx_convolve.h
index 6bf71fc..9ed3f17 100644
--- a/libvpx/vp9/common/vp9_convolve.h
+++ b/libvpx/vpx_dsp/vpx_convolve.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP9_COMMON_VP9_CONVOLVE_H_
-#define VP9_COMMON_VP9_CONVOLVE_H_
+#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_DSP_VPX_CONVOLVE_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -23,8 +23,16 @@
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_CONVOLVE_H_
+#endif  // VPX_DSP_VPX_CONVOLVE_H_
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 0000000..1959c4d
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,337 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-yes += vpx_dsp_common.h
+
+DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
+
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif
+
+# intra predictions
+ifneq ($(filter yes,$(CONFIG_VP9) $(CONFIG_VP10)),)
+DSP_SRCS-yes += intrapred.c
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+endif  # CONFIG_USE_X86INC
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
+endif  # CONFIG_VP9 || CONFIG_VP10
+
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
+
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += vpx_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2)  += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm
+endif
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
+endif
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+
+# common (dspr2)
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_vert_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_macros_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-yes            += txfm_common.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
+DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+# forward transform
+ifneq ($(filter yes,$(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
+endif
+endif
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+# inverse transform
+ifneq ($(filter yes,$(CONFIG_VP9) $(CONFIG_VP10)),)
+DSP_SRCS-yes            += inv_txfm.h
+DSP_SRCS-yes            += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
+endif  # ARCH_X86_64
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
+DSP_SRCS-yes  += arm/idct4x4_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_add_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
+
+DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_VP9 || CONFIG_VP10
+
+# quantization
+ifneq ($(filter yes, $(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+endif
+endif
+endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes            += sad.c
+DSP_SRCS-yes            += subtract.c
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
+
+DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
+DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_USE_X86INC
+
+endif  # CONFIG_ENCODERS
+
+ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes            += variance.c
+DSP_SRCS-yes            += variance.h
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_v_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
+
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
+DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
+
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
+endif  # ARCH_X86_64
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+endif  # CONFIG_USE_X86INC
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
diff --git a/libvpx/vpx_dsp/vpx_dsp_common.h b/libvpx/vpx_dsp/vpx_dsp_common.h
new file mode 100644
index 0000000..ccb8189
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_COMMON_H_
+#define VPX_DSP_COMMON_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  switch (bd) {
+    case 8:
+    default:
+      return (uint16_t)clamp(val, 0, 255);
+    case 10:
+      return (uint16_t)clamp(val, 0, 1023);
+    case 12:
+      return (uint16_t)clamp(val, 0, 4095);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_COMMON_H_
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/libvpx/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 0000000..5fe27b6
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,17 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_dsp_rtcd() {
+  once(setup_rtcd_internal);
+}
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 0000000..1e56d53
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,1744 @@
+sub vpx_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+EOF
+}
+forward_decls qw/vpx_dsp_forward_decls/;
+
+# x86inc.asm had specific constraints. break it out so it's easy to disable.
+# zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+  $mmx_x86inc = 'mmx';
+  $sse_x86inc = 'sse';
+  $sse2_x86inc = 'sse2';
+  $ssse3_x86inc = 'ssse3';
+  $avx_x86inc = 'avx';
+  $avx2_x86inc = 'avx2';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
+}
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+}
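+
+# For example, a line later in this file such as
+#   specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+# picks up an avx2 specialization only when both HAVE_AVX2 and HAVE_SSSE3 are
+# enabled; otherwise the variable is still the empty string set above and
+# contributes no specialization, which is why no else branches are needed.
+# The *_x86inc and *_x86_64 variables gate their variants the same way.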
+
+#
+# Intra prediction
+#
+
+if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+  add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d63_predictor_4x4/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d117_predictor_4x4/;
+
+  add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d135_predictor_4x4 neon/;
+
+  add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
+
+  add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d63_predictor_8x8/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d117_predictor_8x8/;
+
+  add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d135_predictor_8x8/;
+
+  add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
+
+  add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d45_predictor_16x16 neon/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d63_predictor_16x16/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d117_predictor_16x16/;
+
+  add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d135_predictor_16x16/;
+
+  add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d153_predictor_16x16/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_v_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d207_predictor_32x32/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d45_predictor_32x32/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d63_predictor_32x32/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d117_predictor_32x32/;
+
+  add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d135_predictor_32x32/;
+
+  add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_d153_predictor_32x32/, "$ssse3_x86inc";
+
+  add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+
+  add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+  specialize qw/vpx_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+# High bitdepth functions
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d207_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d45_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d63_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_h_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d117_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d135_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d153_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+
+    add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_top_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_left_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_128_predictor_4x4/;
+
+    add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d207_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d45_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d63_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_h_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d117_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d135_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d153_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_v_predictor_8x8/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_tm_predictor_8x8/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_predictor_8x8/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_top_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_left_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_128_predictor_8x8/;
+
+    add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d207_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d45_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d63_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_h_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d117_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d135_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d153_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_top_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_left_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_128_predictor_16x16/;
+
+    add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d207_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d45_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d63_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_h_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d117_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d135_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_d153_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
+
+    add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+
+    add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_top_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_left_predictor_32x32/;
+
+    add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vpx_highbd_dc_128_predictor_32x32/;
+  }  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9 || CONFIG_VP10
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_2d ssse3/;
+
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_horiz/;
+
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_vert/;
+
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_2d/;
+
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_horiz/;
+
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_vert/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Sub Pixel Filters
+  #
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve_copy/;
+
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve_avg/;
+
+  add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Loopfilter
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4 sse2/;
+
+  add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4_1 sse2/;
+
+  add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8 sse2/;
+
+  add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8_1 sse2/;
+
+  add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16 sse2/;
+
+  add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16_1 sse2/;
+
+  add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32 sse2/;
+
+  add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_rd sse2/;
+
+  add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_1 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct4x4 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct8x8 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct8x8_1/;
+
+  add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16_1/;
+
+  add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+
+  add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_1/;
+} else {
+  add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4 sse2 msa/;
+
+  add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4_1 sse2/;
+
+  add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
+
+  add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16 sse2 msa/;
+
+  add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16_1 sse2 msa/;
+
+  add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+
+  add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
+
+  add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_1 sse2 msa/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+#
+# Inverse transform
+#
+if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct4x4_1_add/;
+
+  add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct4x4_16_add/;
+
+  add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct8x8_1_add/;
+
+  add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct8x8_64_add/;
+
+  add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct8x8_12_add/;
+
+  add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct16x16_1_add/;
+
+  add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct16x16_256_add/;
+
+  add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct16x16_10_add/;
+
+  add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct32x32_1024_add/;
+
+  add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct32x32_34_add/;
+
+  add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_idct32x32_1_add/;
+
+  add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_iwht4x4_1_add/;
+
+  add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_iwht4x4_16_add/;
+
+  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct4x4_1_add/;
+
+  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct8x8_1_add/;
+
+  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct16x16_1_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_1024_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_34_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_1_add/;
+
+  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_iwht4x4_1_add/;
+
+  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_iwht4x4_16_add/;
+
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct4x4_16_add/;
+
+    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_64_add/;
+
+    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_10_add/;
+
+    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_256_add/;
+
+    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_10_add/;
+  } else {
+    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct4x4_16_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_64_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_10_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_256_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_10_add sse2/;
+  }  # CONFIG_EMULATE_HARDWARE
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add/;
+
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add/;
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add/;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add/;
+
+    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_1_add/;
+
+    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_16_add/;
+  } else {
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/;
+    # Need to add a 34 eob idct32x32 neon implementation; alias to the 1024 eob neon version for now.
+    $vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_1_add msa/;
+
+    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_16_add msa/, "$sse2_x86inc";
+  }  # CONFIG_EMULATE_HARDWARE
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9 || CONFIG_VP10
+
+#
+# Quantization
+#
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b/;
+
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b_32x32/;
+
+  add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_highbd_quantize_b sse2/;
+
+  add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+} else {
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8 msa/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc";
+
+#
+# Avg
+#
+add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x8_avg msa/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x4_avg msa/, "$sse_x86inc";
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+# Blocks of 3
+add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x3 msa/;
+
+add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x3 msa/;
+
+add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+
+add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
+
+add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x3 sse3 msa/;
+
+add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x3 sse3 msa/;
+
+add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x3 sse3 msa/;
+
+# Blocks of 8
+add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x8 msa/;
+
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x8 msa/;
+
+add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x8 msa/;
+
+add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x8 msa/;
+
+add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x4d msa/, "$sse_x86inc";
+
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
+
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+
+    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
+  #
+  # Single block SAD
+  #
+  add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x8/;
+
+  add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x4/;
+
+  #
+  # Avg
+  #
+  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x8_avg/;
+
+  add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x4_avg/;
+
+  #
+  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+  #
+  # Blocks of 3
+  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x3/;
+
+  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x3/;
+
+  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x3/;
+
+  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x3/;
+
+  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x3/;
+
+  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x3/;
+
+  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x3/;
+
+  # Blocks of 8
+  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x8/;
+
+  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x8/;
+
+  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x8/;
+
+  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x8/;
+
+  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x8/;
+
+  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x8/;
+
+  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x4x8/;
+
+  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x8x8/;
+
+  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x8/;
+
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
+
+  #
+  # Structured Similarity (SSIM)
+  #
+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_highbd_ssim_parms_8x8/;
+  }
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_ENCODERS
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+
+#
+# Variance
+#
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 mmx sse2 msa/;
+
+#
+# Specialty Variance
+#
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x16 sse2 msa/;
+
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
+  specialize qw/vpx_get4x4sse_cs neon msa/;
+
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+
+#
+# Subpixel Variance
+#
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+#
+# Specialty Subpixel
+#
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+
+  add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+  #
+  # Subpixel Variance
+  #
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
+1;
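
Note on the prototype list above: each add_proto line declares one DSP function
signature, and the specialize line that follows names the optimized variants
(mmx, sse2, avx2, msa, media, ...) that may stand in for the C reference
implementation. The rtcd scripts expand these declarations into a generated
header. The sketch below illustrates the general shape of that output for a
single prototype; it is not copied from the generated vpx_dsp_rtcd.h.

  #include "vpx/vpx_integer.h"

  /* Sketch of what the generated header ends up containing for one prototype.
   * Without runtime CPU detection the symbol is resolved at configure time: */
  uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse);
  uint32_t vpx_sub_pixel_variance4x4_msa(const uint8_t *src_ptr, int source_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         uint32_t *sse);
  #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_msa

  /* With --enable-runtime-cpu-detect the define is replaced by a function
   * pointer (RTCD_EXTERN in the generated header) that the rtcd setup routine
   * assigns after probing the CPU at startup. */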
diff --git a/libvpx/vpx_dsp/vpx_filter.h b/libvpx/vpx_dsp/vpx_filter.h
new file mode 100644
index 0000000..2617feb
--- /dev/null
+++ b/libvpx/vpx_dsp/vpx_filter.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VPX_FILTER_H_
+#define VPX_DSP_VPX_FILTER_H_
+
+#include "vpx/vpx_integer.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VPX_FILTER_H_
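
The constants in vpx_filter.h fix the sub-pixel interpolation geometry:
SUBPEL_BITS = 4 gives SUBPEL_SHIFTS = 16 fractional positions, each served by
an 8-tap (SUBPEL_TAPS) kernel whose coefficients sum to 1 << FILTER_BITS = 128.
A minimal sketch of how a kernel table indexed by these constants is applied to
one pixel follows; the helper name and the 'kernels' table are illustrative,
not libvpx symbols.

  #include "vpx_dsp/vpx_filter.h"

  /* Illustrative only: filter one output pixel with the 8-tap kernel selected
   * by a 4-bit sub-pixel offset.  'kernels' stands for one interpolation
   * filter bank with SUBPEL_SHIFTS entries of SUBPEL_TAPS taps each. */
  static uint8_t filter_one_pixel(const uint8_t *src, int subpel_x,
                                  const InterpKernel *kernels) {
    const int16_t *k = kernels[subpel_x & SUBPEL_MASK];
    int sum = 0, i;
    for (i = 0; i < SUBPEL_TAPS; ++i)
      sum += k[i] * src[i - SUBPEL_TAPS / 2 + 1];  /* taps span src[-3..4] */
    /* The taps of each kernel sum to 1 << FILTER_BITS (128), so scale back
     * with rounding and clamp to the 8-bit pixel range. */
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
    return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }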
diff --git a/libvpx/vpx_dsp/x86/convolve.h b/libvpx/vpx_dsp/x86/convolve.h
new file mode 100644
index 0000000..b6fbfcf
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/convolve.h
@@ -0,0 +1,290 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_CONVOLVE_H_
+#define VPX_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+  const uint8_t *src_ptr,
+  ptrdiff_t src_pitch,
+  uint8_t *output_ptr,
+  ptrdiff_t out_pitch,
+  uint32_t output_height,
+  const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                    uint8_t *dst, ptrdiff_t dst_stride, \
+                                    const int16_t *filter_x, int x_step_q4, \
+                                    const int16_t *filter_y, int y_step_q4, \
+                                    int w, int h) { \
+  assert(filter[3] != 128); \
+  assert(step_q4 == 16); \
+  if (filter[0] || filter[1] || filter[2]) { \
+    while (w >= 16) { \
+      vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                               src_stride, \
+                                               dst, \
+                                               dst_stride, \
+                                               h, \
+                                               filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } else { \
+    while (w >= 16) { \
+      vpx_filter_block1d16_##dir##2_##avg##opt(src, \
+                                               src_stride, \
+                                               dst, \
+                                               dst_stride, \
+                                               h, \
+                                               filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vpx_filter_block1d8_##dir##2_##avg##opt(src, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vpx_filter_block1d4_##dir##2_##avg##opt(src, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  assert(filter_x[3] != 128); \
+  assert(filter_y[3] != 128); \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  assert(x_step_q4 == 16); \
+  assert(y_step_q4 == 16); \
+  if (filter_x[0] || filter_x[1] || filter_x[2]|| \
+      filter_y[0] || filter_y[1] || filter_y[2]) { \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 7); \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, \
+                                    y_step_q4, w, h); \
+  } else { \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 1); \
+    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, \
+                                    y_step_q4, w, h); \
+  } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+  const uint16_t *src_ptr,
+  const ptrdiff_t src_pitch,
+  uint16_t *output_ptr,
+  ptrdiff_t out_pitch,
+  unsigned int output_height,
+  const int16_t *filter,
+  int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+                                           ptrdiff_t src_stride, \
+                                           uint8_t *dst8, \
+                                           ptrdiff_t dst_stride, \
+                                           const int16_t *filter_x, \
+                                           int x_step_q4, \
+                                           const int16_t *filter_y, \
+                                           int y_step_q4, \
+                                           int w, int h, int bd) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    if (filter[0] || filter[1] || filter[2]) { \
+      while (w >= 16) { \
+        vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { \
+    vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h, bd); \
+  } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                     uint8_t *dst, ptrdiff_t dst_stride, \
+                                     const int16_t *filter_x, int x_step_q4, \
+                                     const int16_t *filter_y, int y_step_q4, \
+                                     int w, int h, int bd) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 7, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+                                             64, dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } else { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 1, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                             dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } \
+  } else { \
+    vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
+                                  h, bd); \
+  } \
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_DSP_X86_CONVOLVE_H_
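
FUN_CONV_1D stamps out a full vpx_convolve8_<name>_<opt> entry point from the
per-width 1-D kernels (vpx_filter_block1d{16,8,4}_...), taking the 8-tap path
when the outer filter taps are non-zero and the cheaper 2-tap path otherwise;
FUN_CONV_2D composes a horizontal pass into a 64-wide intermediate buffer (with
the extra rows an 8-tap vertical pass needs) followed by a vertical pass. The
invocations below are an illustrative sketch of how an x86 stub file
instantiates the macros, assuming the SSE2 per-width kernels are declared in
the same translation unit.

  /* Illustrative instantiations; each line expands to one exported
   * vpx_convolve8_* function built on the SSE2 block filters. */
  FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
  FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
  FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
  FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
  FUN_CONV_2D(, sse2);
  FUN_CONV_2D(avg_, sse2);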
diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
similarity index 98%
rename from libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
rename to libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 9ea22fe..4df39df 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -9,17 +9,18 @@
  */
 
 #include <immintrin.h>  // AVX2
-#include "vp9/common/vp9_idct.h"  // for cospi constants
-#include "vpx_ports/mem.h"
+
+#include "vpx_dsp/txfm_common.h"
 
 #define pair256_set_epi16(a, b) \
-  _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
 
 #define pair256_set_epi32(a, b) \
-  _mm256_set_epi32(b, a, b, a, b, a, b, a)
-
-
-
+  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
+                   (int)(b), (int)(a), (int)(b), (int)(a))
 
 #if FDCT32x32_HIGH_PRECISION
 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
@@ -50,7 +51,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
   const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64,   cospi_24_64);
   const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
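
The casts added to pair256_set_epi16/pair256_set_epi32 make the narrowing of
the (possibly negated) cospi constants explicit for the _mm256_set_* intrinsics,
which take 16- and 32-bit arguments; the values of the constants are unchanged.
For context, a minimal sketch of how such a pair constant is consumed (SSE2
shown; the AVX2 code does the same on 256-bit registers):

  #include <emmintrin.h>  // SSE2

  /* Illustrative sketch: interleaving two vectors of 16-bit inputs and
   * multiplying by the {a, b} pair constant with _mm_madd_epi16 yields
   * a*x + b*y in each 32-bit lane -- the basic butterfly used throughout
   * the forward 32x32 DCT. */
  static __m128i butterfly_lo(__m128i x, __m128i y, __m128i k_pair_ab) {
    const __m128i lo = _mm_unpacklo_epi16(x, y);
    return _mm_madd_epi16(lo, k_pair_ab);
  }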
diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
similarity index 81%
rename from libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
rename to libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index 42fdbbd..b85ae10 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -9,31 +9,55 @@
  */
 
 #include <emmintrin.h>  // SSE2
-#include "vp9/common/vp9_idct.h"  // for cospi constants
-#include "vpx_ports/mem.h"
 
-#define pair_set_epi32(a, b) \
-  _mm_set_epi32(b, a, b, a)
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
 
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also causes cross reference to the static
+// functions in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
 #if FDCT32x32_HIGH_PRECISION
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
-  __m128i buf0, buf1;
-  buf0 = _mm_mul_epu32(a, b);
-  a = _mm_srli_epi64(a, 32);
-  b = _mm_srli_epi64(b, 32);
-  buf1 = _mm_mul_epu32(a, b);
-  return _mm_add_epi64(buf0, buf1);
+void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+    int i, j;
+    for (i = 0; i < 32; ++i) {
+      tran_high_t temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = intermediate[j * 32 + i];
+      vpx_fdct32(temp_in, temp_out, 0);
+      for (j = 0; j < 32; ++j)
+        out[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    }
 }
+  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
+  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
+#else
+void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+    int i, j;
+    for (i = 0; i < 32; ++i) {
+      tran_high_t temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = intermediate[j * 32 + i];
+      vpx_fdct32(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        out[j + i * 32] = (tran_low_t)temp_out[j];
+    }
+}
+  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
+  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
+#endif  // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif  // DCT_HIGH_BIT_DEPTH
 
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
-  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm_unpacklo_epi64(buf0, buf1);
-}
-#endif
 
 void FDCT32x32_2D(const int16_t *input,
-                  int16_t *output_org, int stride) {
+                  tran_low_t *output_org, int stride) {
   // Calculate pre-multiplied strides
   const int str1 = stride;
   const int str2 = 2 * stride;
@@ -44,7 +68,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64,   cospi_24_64);
   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -84,6 +108,9 @@
   const __m128i kOne  = _mm_set1_epi16(1);
   // Do the two transform/transpose passes
   int pass;
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
   for (pass = 0; pass < 2; ++pass) {
     // We process eight columns (transposed rows in second pass) at a time.
     int column_start;
@@ -237,14 +264,23 @@
           __m128i in29  = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
           __m128i in30  = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
           __m128i in31  = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
-          step1[ 0] = _mm_add_epi16(in00, in31);
-          step1[ 1] = _mm_add_epi16(in01, in30);
-          step1[ 2] = _mm_add_epi16(in02, in29);
-          step1[ 3] = _mm_add_epi16(in03, in28);
-          step1[28] = _mm_sub_epi16(in03, in28);
-          step1[29] = _mm_sub_epi16(in02, in29);
-          step1[30] = _mm_sub_epi16(in01, in30);
-          step1[31] = _mm_sub_epi16(in00, in31);
+          step1[0] = ADD_EPI16(in00, in31);
+          step1[1] = ADD_EPI16(in01, in30);
+          step1[2] = ADD_EPI16(in02, in29);
+          step1[3] = ADD_EPI16(in03, in28);
+          step1[28] = SUB_EPI16(in03, in28);
+          step1[29] = SUB_EPI16(in02, in29);
+          step1[30] = SUB_EPI16(in01, in30);
+          step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+                                             &step1[3], &step1[28], &step1[29],
+                                             &step1[30], &step1[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           __m128i in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 32));
@@ -255,14 +291,23 @@
           __m128i in25  = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
           __m128i in26  = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
           __m128i in27  = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
-          step1[ 4] = _mm_add_epi16(in04, in27);
-          step1[ 5] = _mm_add_epi16(in05, in26);
-          step1[ 6] = _mm_add_epi16(in06, in25);
-          step1[ 7] = _mm_add_epi16(in07, in24);
-          step1[24] = _mm_sub_epi16(in07, in24);
-          step1[25] = _mm_sub_epi16(in06, in25);
-          step1[26] = _mm_sub_epi16(in05, in26);
-          step1[27] = _mm_sub_epi16(in04, in27);
+          step1[4] = ADD_EPI16(in04, in27);
+          step1[5] = ADD_EPI16(in05, in26);
+          step1[6] = ADD_EPI16(in06, in25);
+          step1[7] = ADD_EPI16(in07, in24);
+          step1[24] = SUB_EPI16(in07, in24);
+          step1[25] = SUB_EPI16(in06, in25);
+          step1[26] = SUB_EPI16(in05, in26);
+          step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+                                             &step1[7], &step1[24], &step1[25],
+                                             &step1[26], &step1[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           __m128i in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 32));
@@ -273,14 +318,23 @@
           __m128i in21  = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
           __m128i in22  = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
           __m128i in23  = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
-          step1[ 8] = _mm_add_epi16(in08, in23);
-          step1[ 9] = _mm_add_epi16(in09, in22);
-          step1[10] = _mm_add_epi16(in10, in21);
-          step1[11] = _mm_add_epi16(in11, in20);
-          step1[20] = _mm_sub_epi16(in11, in20);
-          step1[21] = _mm_sub_epi16(in10, in21);
-          step1[22] = _mm_sub_epi16(in09, in22);
-          step1[23] = _mm_sub_epi16(in08, in23);
+          step1[8] = ADD_EPI16(in08, in23);
+          step1[9] = ADD_EPI16(in09, in22);
+          step1[10] = ADD_EPI16(in10, in21);
+          step1[11] = ADD_EPI16(in11, in20);
+          step1[20] = SUB_EPI16(in11, in20);
+          step1[21] = SUB_EPI16(in10, in21);
+          step1[22] = SUB_EPI16(in09, in22);
+          step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+                                             &step1[11], &step1[20], &step1[21],
+                                             &step1[22], &step1[23]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           __m128i in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
@@ -291,34 +345,57 @@
           __m128i in17  = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
           __m128i in18  = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
           __m128i in19  = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
-          step1[12] = _mm_add_epi16(in12, in19);
-          step1[13] = _mm_add_epi16(in13, in18);
-          step1[14] = _mm_add_epi16(in14, in17);
-          step1[15] = _mm_add_epi16(in15, in16);
-          step1[16] = _mm_sub_epi16(in15, in16);
-          step1[17] = _mm_sub_epi16(in14, in17);
-          step1[18] = _mm_sub_epi16(in13, in18);
-          step1[19] = _mm_sub_epi16(in12, in19);
+          step1[12] = ADD_EPI16(in12, in19);
+          step1[13] = ADD_EPI16(in13, in18);
+          step1[14] = ADD_EPI16(in14, in17);
+          step1[15] = ADD_EPI16(in15, in16);
+          step1[16] = SUB_EPI16(in15, in16);
+          step1[17] = SUB_EPI16(in14, in17);
+          step1[18] = SUB_EPI16(in13, in18);
+          step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+                                             &step1[15], &step1[16], &step1[17],
+                                             &step1[18], &step1[19]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
       }
       // Stage 2
       {
-        step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
-        step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
-        step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
-        step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
-        step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
-        step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
-        step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
-        step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
-        step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
-        step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
-        step2[10] = _mm_sub_epi16(step1[5], step1[10]);
-        step2[11] = _mm_sub_epi16(step1[4], step1[11]);
-        step2[12] = _mm_sub_epi16(step1[3], step1[12]);
-        step2[13] = _mm_sub_epi16(step1[2], step1[13]);
-        step2[14] = _mm_sub_epi16(step1[1], step1[14]);
-        step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+        step2[0] = ADD_EPI16(step1[0], step1[15]);
+        step2[1] = ADD_EPI16(step1[1], step1[14]);
+        step2[2] = ADD_EPI16(step1[2], step1[13]);
+        step2[3] = ADD_EPI16(step1[3], step1[12]);
+        step2[4] = ADD_EPI16(step1[4], step1[11]);
+        step2[5] = ADD_EPI16(step1[5], step1[10]);
+        step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+        step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+        step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+        step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+        step2[10] = SUB_EPI16(step1[5], step1[10]);
+        step2[11] = SUB_EPI16(step1[4], step1[11]);
+        step2[12] = SUB_EPI16(step1[3], step1[12]);
+        step2[13] = SUB_EPI16(step1[2], step1[13]);
+        step2[14] = SUB_EPI16(step1[1], step1[14]);
+        step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
@@ -387,6 +464,18 @@
         step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
         step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
         step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+                                           &step2[23], &step2[24], &step2[25],
+                                           &step2[26], &step2[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
 
 #if !FDCT32x32_HIGH_PRECISION
@@ -426,49 +515,63 @@
         __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
         __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
 
-        step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
-        step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
-        step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
-        step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
-        step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
-        step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
-        step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
-        step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
-        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
-        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
-        step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
-        step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
-        step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
-        step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
-        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
-        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
-        step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
-        step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
-        step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
-        step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
-        step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
-        step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
-        step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
-        step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
-        step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
-        step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
-        step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
-        step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
-        step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
-        step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
-        step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
-        step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
-
-        step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
-        step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
-        step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
-        step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
-        step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
-        step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
-        step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
-        step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
-        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
-        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+        step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+        step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+        step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+        step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+        step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+        step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+        step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+        step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+        step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+        step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+        step2[10] = SUB_EPI16(step2[10], s3_10_0);
+        step2[11] = SUB_EPI16(step2[11], s3_11_0);
+        step2[12] = SUB_EPI16(step2[12], s3_12_0);
+        step2[13] = SUB_EPI16(step2[13], s3_13_0);
+        step2[14] = SUB_EPI16(step2[14], s2_14_0);
+        step2[15] = SUB_EPI16(step2[15], s2_15_0);
+        step1[16] = SUB_EPI16(step1[16], s3_16_0);
+        step1[17] = SUB_EPI16(step1[17], s3_17_0);
+        step1[18] = SUB_EPI16(step1[18], s3_18_0);
+        step1[19] = SUB_EPI16(step1[19], s3_19_0);
+        step2[20] = SUB_EPI16(step2[20], s3_20_0);
+        step2[21] = SUB_EPI16(step2[21], s3_21_0);
+        step2[22] = SUB_EPI16(step2[22], s3_22_0);
+        step2[23] = SUB_EPI16(step2[23], s3_23_0);
+        step2[24] = SUB_EPI16(step2[24], s3_24_0);
+        step2[25] = SUB_EPI16(step2[25], s3_25_0);
+        step2[26] = SUB_EPI16(step2[26], s3_26_0);
+        step2[27] = SUB_EPI16(step2[27], s3_27_0);
+        step1[28] = SUB_EPI16(step1[28], s3_28_0);
+        step1[29] = SUB_EPI16(step1[29], s3_29_0);
+        step1[30] = SUB_EPI16(step1[30], s3_30_0);
+        step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x32(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15],
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        step2[0] = _mm_add_epi16(step2[ 0], kOne);
+        step2[1] = _mm_add_epi16(step2[ 1], kOne);
+        step2[2] = _mm_add_epi16(step2[ 2], kOne);
+        step2[3] = _mm_add_epi16(step2[ 3], kOne);
+        step2[4] = _mm_add_epi16(step2[ 4], kOne);
+        step2[5] = _mm_add_epi16(step2[ 5], kOne);
+        step2[6] = _mm_add_epi16(step2[ 6], kOne);
+        step2[7] = _mm_add_epi16(step2[ 7], kOne);
+        step2[8] = _mm_add_epi16(step2[ 8], kOne);
+        step2[9] = _mm_add_epi16(step2[ 9], kOne);
         step2[10] = _mm_add_epi16(step2[10], kOne);
         step2[11] = _mm_add_epi16(step2[11], kOne);
         step2[12] = _mm_add_epi16(step2[12], kOne);
@@ -492,16 +595,16 @@
         step1[30] = _mm_add_epi16(step1[30], kOne);
         step1[31] = _mm_add_epi16(step1[31], kOne);
 
-        step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
-        step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
-        step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
-        step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
-        step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
-        step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
-        step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
-        step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
-        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
-        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+        step2[0] = _mm_srai_epi16(step2[ 0], 2);
+        step2[1] = _mm_srai_epi16(step2[ 1], 2);
+        step2[2] = _mm_srai_epi16(step2[ 2], 2);
+        step2[3] = _mm_srai_epi16(step2[ 3], 2);
+        step2[4] = _mm_srai_epi16(step2[ 4], 2);
+        step2[5] = _mm_srai_epi16(step2[ 5], 2);
+        step2[6] = _mm_srai_epi16(step2[ 6], 2);
+        step2[7] = _mm_srai_epi16(step2[ 7], 2);
+        step2[8] = _mm_srai_epi16(step2[ 8], 2);
+        step2[9] = _mm_srai_epi16(step2[ 9], 2);
         step2[10] = _mm_srai_epi16(step2[10], 2);
         step2[11] = _mm_srai_epi16(step2[11], 2);
         step2[12] = _mm_srai_epi16(step2[12], 2);
@@ -525,21 +628,33 @@
         step1[30] = _mm_srai_epi16(step1[30], 2);
         step1[31] = _mm_srai_epi16(step1[31], 2);
       }
-#endif
+#endif  // !FDCT32x32_HIGH_PRECISION
 
 #if FDCT32x32_HIGH_PRECISION
       if (pass == 0) {
 #endif
       // Stage 3
       {
-        step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
-        step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
-        step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
-        step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
-        step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
-        step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
-        step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
-        step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+        step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+        step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+        step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+        step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+        step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+        step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+        step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+        step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+                                           &step3[3], &step3[4], &step3[5],
+                                           &step3[6], &step3[7]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@@ -576,40 +691,79 @@
         step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
         step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
         step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+                                           &step3[12], &step3[13]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
-        step3[16] = _mm_add_epi16(step2[23], step1[16]);
-        step3[17] = _mm_add_epi16(step2[22], step1[17]);
-        step3[18] = _mm_add_epi16(step2[21], step1[18]);
-        step3[19] = _mm_add_epi16(step2[20], step1[19]);
-        step3[20] = _mm_sub_epi16(step1[19], step2[20]);
-        step3[21] = _mm_sub_epi16(step1[18], step2[21]);
-        step3[22] = _mm_sub_epi16(step1[17], step2[22]);
-        step3[23] = _mm_sub_epi16(step1[16], step2[23]);
-        step3[24] = _mm_sub_epi16(step1[31], step2[24]);
-        step3[25] = _mm_sub_epi16(step1[30], step2[25]);
-        step3[26] = _mm_sub_epi16(step1[29], step2[26]);
-        step3[27] = _mm_sub_epi16(step1[28], step2[27]);
-        step3[28] = _mm_add_epi16(step2[27], step1[28]);
-        step3[29] = _mm_add_epi16(step2[26], step1[29]);
-        step3[30] = _mm_add_epi16(step2[25], step1[30]);
-        step3[31] = _mm_add_epi16(step2[24], step1[31]);
+        step3[16] = ADD_EPI16(step2[23], step1[16]);
+        step3[17] = ADD_EPI16(step2[22], step1[17]);
+        step3[18] = ADD_EPI16(step2[21], step1[18]);
+        step3[19] = ADD_EPI16(step2[20], step1[19]);
+        step3[20] = SUB_EPI16(step1[19], step2[20]);
+        step3[21] = SUB_EPI16(step1[18], step2[21]);
+        step3[22] = SUB_EPI16(step1[17], step2[22]);
+        step3[23] = SUB_EPI16(step1[16], step2[23]);
+        step3[24] = SUB_EPI16(step1[31], step2[24]);
+        step3[25] = SUB_EPI16(step1[30], step2[25]);
+        step3[26] = SUB_EPI16(step1[29], step2[26]);
+        step3[27] = SUB_EPI16(step1[28], step2[27]);
+        step3[28] = ADD_EPI16(step2[27], step1[28]);
+        step3[29] = ADD_EPI16(step2[26], step1[29]);
+        step3[30] = ADD_EPI16(step2[25], step1[30]);
+        step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step3[16], &step3[17], &step3[18], &step3[19],
+            &step3[20], &step3[21], &step3[22], &step3[23],
+            &step3[24], &step3[25], &step3[26], &step3[27],
+            &step3[28], &step3[29], &step3[30], &step3[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
 
       // Stage 4
       {
-        step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
-        step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
-        step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
-        step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
-        step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
-        step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
-        step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
-        step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
-        step1[12] = _mm_sub_epi16(step2[15], step3[12]);
-        step1[13] = _mm_sub_epi16(step2[14], step3[13]);
-        step1[14] = _mm_add_epi16(step3[13], step2[14]);
-        step1[15] = _mm_add_epi16(step3[12], step2[15]);
+        step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+        step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+        step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+        step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+        step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+        step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+        step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+        step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+        step1[12] = SUB_EPI16(step2[15], step3[12]);
+        step1[13] = SUB_EPI16(step2[14], step3[13]);
+        step1[14] = ADD_EPI16(step3[13], step2[14]);
+        step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[0], &step1[1], &step1[2], &step1[3],
+            &step1[4], &step1[5], &step1[6], &step1[7],
+            &step1[8], &step1[9], &step1[10], &step1[11],
+            &step1[12], &step1[13], &step1[14], &step1[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
@@ -630,6 +784,16 @@
         // Combine
         step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
         step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
@@ -698,13 +862,36 @@
         step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
         step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
         step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+                                           &step1[21], &step1[26], &step1[27],
+                                           &step1[28], &step1[29]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       // Stage 5
       {
-        step2[4] = _mm_add_epi16(step1[5], step3[4]);
-        step2[5] = _mm_sub_epi16(step3[4], step1[5]);
-        step2[6] = _mm_sub_epi16(step3[7], step1[6]);
-        step2[7] = _mm_add_epi16(step1[6], step3[7]);
+        step2[4] = ADD_EPI16(step1[5], step3[4]);
+        step2[5] = SUB_EPI16(step3[4], step1[5]);
+        step2[6] = SUB_EPI16(step3[7], step1[6]);
+        step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+                                           &step2[6], &step2[7]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
@@ -741,6 +928,17 @@
         out[16] = _mm_packs_epi32(out_16_6, out_16_7);
         out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
         out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                           &out[8], &out[24]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
@@ -777,24 +975,49 @@
         step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
         step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
         step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+                                           &step2[13], &step2[14]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
-        step2[16] = _mm_add_epi16(step1[19], step3[16]);
-        step2[17] = _mm_add_epi16(step1[18], step3[17]);
-        step2[18] = _mm_sub_epi16(step3[17], step1[18]);
-        step2[19] = _mm_sub_epi16(step3[16], step1[19]);
-        step2[20] = _mm_sub_epi16(step3[23], step1[20]);
-        step2[21] = _mm_sub_epi16(step3[22], step1[21]);
-        step2[22] = _mm_add_epi16(step1[21], step3[22]);
-        step2[23] = _mm_add_epi16(step1[20], step3[23]);
-        step2[24] = _mm_add_epi16(step1[27], step3[24]);
-        step2[25] = _mm_add_epi16(step1[26], step3[25]);
-        step2[26] = _mm_sub_epi16(step3[25], step1[26]);
-        step2[27] = _mm_sub_epi16(step3[24], step1[27]);
-        step2[28] = _mm_sub_epi16(step3[31], step1[28]);
-        step2[29] = _mm_sub_epi16(step3[30], step1[29]);
-        step2[30] = _mm_add_epi16(step1[29], step3[30]);
-        step2[31] = _mm_add_epi16(step1[28], step3[31]);
+        step2[16] = ADD_EPI16(step1[19], step3[16]);
+        step2[17] = ADD_EPI16(step1[18], step3[17]);
+        step2[18] = SUB_EPI16(step3[17], step1[18]);
+        step2[19] = SUB_EPI16(step3[16], step1[19]);
+        step2[20] = SUB_EPI16(step3[23], step1[20]);
+        step2[21] = SUB_EPI16(step3[22], step1[21]);
+        step2[22] = ADD_EPI16(step1[21], step3[22]);
+        step2[23] = ADD_EPI16(step1[20], step3[23]);
+        step2[24] = ADD_EPI16(step1[27], step3[24]);
+        step2[25] = ADD_EPI16(step1[26], step3[25]);
+        step2[26] = SUB_EPI16(step3[25], step1[26]);
+        step2[27] = SUB_EPI16(step3[24], step1[27]);
+        step2[28] = SUB_EPI16(step3[31], step1[28]);
+        step2[29] = SUB_EPI16(step3[30], step1[29]);
+        step2[30] = ADD_EPI16(step1[29], step3[30]);
+        step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[16], &step2[17], &step2[18], &step2[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step2[28], &step2[29], &step2[30], &step2[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       // Stage 6
       {
@@ -832,20 +1055,43 @@
         const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
         const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
         // Combine
-        out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+        out[4] = _mm_packs_epi32(out_04_6, out_04_7);
         out[20] = _mm_packs_epi32(out_20_6, out_20_7);
         out[12] = _mm_packs_epi32(out_12_6, out_12_7);
         out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                           &out[12], &out[28]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
-        step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
-        step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
-        step3[10] = _mm_sub_epi16(step1[11], step2[10]);
-        step3[11] = _mm_add_epi16(step2[10], step1[11]);
-        step3[12] = _mm_add_epi16(step2[13], step1[12]);
-        step3[13] = _mm_sub_epi16(step1[12], step2[13]);
-        step3[14] = _mm_sub_epi16(step1[15], step2[14]);
-        step3[15] = _mm_add_epi16(step2[14], step1[15]);
+        step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
+        step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+        step3[10] = SUB_EPI16(step1[11], step2[10]);
+        step3[11] = ADD_EPI16(step2[10], step1[11]);
+        step3[12] = ADD_EPI16(step2[13], step1[12]);
+        step3[13] = SUB_EPI16(step1[12], step2[13]);
+        step3[14] = SUB_EPI16(step1[15], step2[14]);
+        step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+                                           &step3[11], &step3[12], &step3[13],
+                                           &step3[14], &step3[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
@@ -915,6 +1161,18 @@
         step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
         step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
         step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+                                           &step3[22], &step3[25], &step3[26],
+                                           &step3[29], &step3[30]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       // Stage 7
       {
@@ -984,24 +1242,50 @@
         out[22] = _mm_packs_epi32(out_22_6, out_22_7);
         out[14] = _mm_packs_epi32(out_14_6, out_14_7);
         out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                           &out[26], &out[6], &out[22],
+                                           &out[14], &out[30]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
-        step1[16] = _mm_add_epi16(step3[17], step2[16]);
-        step1[17] = _mm_sub_epi16(step2[16], step3[17]);
-        step1[18] = _mm_sub_epi16(step2[19], step3[18]);
-        step1[19] = _mm_add_epi16(step3[18], step2[19]);
-        step1[20] = _mm_add_epi16(step3[21], step2[20]);
-        step1[21] = _mm_sub_epi16(step2[20], step3[21]);
-        step1[22] = _mm_sub_epi16(step2[23], step3[22]);
-        step1[23] = _mm_add_epi16(step3[22], step2[23]);
-        step1[24] = _mm_add_epi16(step3[25], step2[24]);
-        step1[25] = _mm_sub_epi16(step2[24], step3[25]);
-        step1[26] = _mm_sub_epi16(step2[27], step3[26]);
-        step1[27] = _mm_add_epi16(step3[26], step2[27]);
-        step1[28] = _mm_add_epi16(step3[29], step2[28]);
-        step1[29] = _mm_sub_epi16(step2[28], step3[29]);
-        step1[30] = _mm_sub_epi16(step2[31], step3[30]);
-        step1[31] = _mm_add_epi16(step3[30], step2[31]);
+        step1[16] = ADD_EPI16(step3[17], step2[16]);
+        step1[17] = SUB_EPI16(step2[16], step3[17]);
+        step1[18] = SUB_EPI16(step2[19], step3[18]);
+        step1[19] = ADD_EPI16(step3[18], step2[19]);
+        step1[20] = ADD_EPI16(step3[21], step2[20]);
+        step1[21] = SUB_EPI16(step2[20], step3[21]);
+        step1[22] = SUB_EPI16(step2[23], step3[22]);
+        step1[23] = ADD_EPI16(step3[22], step2[23]);
+        step1[24] = ADD_EPI16(step3[25], step2[24]);
+        step1[25] = SUB_EPI16(step2[24], step3[25]);
+        step1[26] = SUB_EPI16(step2[27], step3[26]);
+        step1[27] = ADD_EPI16(step3[26], step2[27]);
+        step1[28] = ADD_EPI16(step3[29], step2[28]);
+        step1[29] = SUB_EPI16(step2[28], step3[29]);
+        step1[30] = SUB_EPI16(step2[31], step3[30]);
+        step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step1[20], &step1[21], &step1[22], &step1[23],
+            &step1[24], &step1[25], &step1[26], &step1[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       // Final stage --- outputs indices are bit-reversed.
       {
@@ -1071,6 +1355,18 @@
         out[23] = _mm_packs_epi32(out_23_6, out_23_7);
         out[15] = _mm_packs_epi32(out_15_6, out_15_7);
         out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                           &out[25], &out[7], &out[23],
+                                           &out[15], &out[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
       {
         const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
@@ -1139,6 +1435,18 @@
         out[19] = _mm_packs_epi32(out_19_6, out_19_7);
         out[11] = _mm_packs_epi32(out_11_6, out_11_7);
         out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                           &out[29], &out[3], &out[19],
+                                           &out[11], &out[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
       }
 #if FDCT32x32_HIGH_PRECISION
       } else {
@@ -1390,15 +1698,22 @@
 
         // TODO(jingning): manually inline k_madd_epi32_ to further hide
         // instruction latency.
-        v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
-        v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
-        v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
-        v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
-        v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
-        v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
-        v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
-        v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
-
+        v[0] = k_madd_epi32(u[0], k32_p16_m16);
+        v[1] = k_madd_epi32(u[1], k32_p16_m16);
+        v[2] = k_madd_epi32(u[2], k32_p16_m16);
+        v[3] = k_madd_epi32(u[3], k32_p16_m16);
+        v[4] = k_madd_epi32(u[0], k32_p16_p16);
+        v[5] = k_madd_epi32(u[1], k32_p16_p16);
+        v[6] = k_madd_epi32(u[2], k32_p16_p16);
+        v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+                                            &v[4], &v[5], &v[6], &v[7], &kZero);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
         u[0] = k_packs_epi64(v[0], v[1]);
         u[1] = k_packs_epi64(v[2], v[3]);
         u[2] = k_packs_epi64(v[4], v[5]);
@@ -1469,6 +1784,18 @@
           v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
           v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -1565,6 +1892,16 @@
           v[14] = k_madd_epi32(u[6], k32_m08_p24);
           v[15] = k_madd_epi32(u[7], k32_m08_p24);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[0] = k_packs_epi64(v[0], v[1]);
           u[1] = k_packs_epi64(v[2], v[3]);
           u[2] = k_packs_epi64(v[4], v[5]);
@@ -1633,6 +1970,14 @@
           out[16] = _mm_packs_epi32(u[2], u[3]);
           out[ 8] = _mm_packs_epi32(u[4], u[5]);
           out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                             &out[8], &out[24]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -1665,6 +2010,16 @@
           v[14] = k_madd_epi32(u[2], k32_p24_p08);
           v[15] = k_madd_epi32(u[3], k32_p24_p08);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[0] = k_packs_epi64(v[0], v[1]);
           u[1] = k_packs_epi64(v[2], v[3]);
           u[2] = k_packs_epi64(v[4], v[5]);
@@ -1767,6 +2122,16 @@
           v[14] = k_madd_epi32(u[14], k32_m04_p28);
           v[15] = k_madd_epi32(u[15], k32_m04_p28);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[0] = k_packs_epi64(v[0], v[1]);
           u[1] = k_packs_epi64(v[2], v[3]);
           u[2] = k_packs_epi64(v[4], v[5]);
@@ -1834,6 +2199,14 @@
           out[20] = _mm_packs_epi32(u[2], u[3]);
           out[12] = _mm_packs_epi32(u[4], u[5]);
           out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                             &out[12], &out[28]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
@@ -1912,6 +2285,18 @@
           v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
           v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2024,6 +2409,18 @@
           v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
           v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2151,6 +2548,15 @@
           out[22] = _mm_packs_epi32(u[10], u[11]);
           out[14] = _mm_packs_epi32(u[12], u[13]);
           out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                             &out[26], &out[6], &out[22],
+                                             &out[14], &out[30]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
@@ -2247,6 +2653,18 @@
           v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
           v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2374,6 +2792,15 @@
           out[23] = _mm_packs_epi32(u[10], u[11]);
           out[15] = _mm_packs_epi32(u[12], u[13]);
           out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                             &out[25], &out[7], &out[23],
+                                             &out[15], &out[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
         {
           const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
@@ -2435,6 +2862,18 @@
           v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
           v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
 
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
           u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
           u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
           u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2562,18 +3001,23 @@
           out[19] = _mm_packs_epi32(u[10], u[11]);
           out[11] = _mm_packs_epi32(u[12], u[13]);
           out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                             &out[29], &out[3], &out[19],
+                                             &out[11], &out[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
         }
       }
-#endif
+#endif  // FDCT32x32_HIGH_PRECISION
       // Transpose the results, do it as four 8x8 transposes.
       {
         int transpose_block;
-        int16_t *output;
-        if (0 == pass) {
-          output = &intermediate[column_start * 32];
-        } else {
-          output = &output_org[column_start * 32];
-        }
+        int16_t *output0 = &intermediate[column_start * 32];
+        tran_low_t *output1 = &output_org[column_start * 32];
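+        // output0 is only used in pass 0, where the intermediate results
+        // stay 16-bit; output1 is used in pass 1 and goes through
+        // storeu_output(), which stores into tran_low_t (widening the 16-bit
+        // lanes when tran_low_t is a 32-bit type in high bit-depth builds).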
         for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
           __m128i *this_out = &out[8 * transpose_block];
           // 00 01 02 03 04 05 06 07
@@ -2674,18 +3118,36 @@
           }
           // Note: even though all these stores are aligned, using the aligned
           //       intrinsic makes the code slightly slower.
-          _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
-          _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
-          _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
-          _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
-          _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
-          _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
-          _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
-          _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
-          // Process next 8x8
-          output += 8;
+          if (pass == 0) {
+            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+            // Process next 8x8
+            output0 += 8;
+          } else {
+            storeu_output(&tr2_0, (output1 + 0 * 32));
+            storeu_output(&tr2_1, (output1 + 1 * 32));
+            storeu_output(&tr2_2, (output1 + 2 * 32));
+            storeu_output(&tr2_3, (output1 + 3 * 32));
+            storeu_output(&tr2_4, (output1 + 4 * 32));
+            storeu_output(&tr2_5, (output1 + 5 * 32));
+            storeu_output(&tr2_6, (output1 + 6 * 32));
+            storeu_output(&tr2_7, (output1 + 7 * 32));
+            // Process next 8x8
+            output1 += 8;
+          }
         }
       }
     }
   }
 }  // NOLINT
+
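+// The helper macros are #undef'd below, presumably so that this
+// implementation file can be included more than once (once per
+// FDCT32x32_2D / DCT_HIGH_BIT_DEPTH combination) without redefinition
+// warnings or stale fallbacks.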
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_avx2.c b/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
similarity index 65%
rename from libvpx/vp9/encoder/x86/vp9_dct_avx2.c
rename to libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
index 3a19f52..6d9da6a 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -8,19 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <immintrin.h>  // AVX2
-#include "vp9/common/vp9_idct.h"  // for cospi constants
-#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
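+// This wrapper instantiates the shared 32x32 forward DCT implementation
+// twice: FDCT32x32_2D_AVX2 names the function each instantiation defines,
+// and FDCT32x32_HIGH_PRECISION selects the rounding variant (0 for
+// vpx_fdct32x32_rd_avx2, 1 for vpx_fdct32x32_avx2).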
 
-
-#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
 #define FDCT32x32_HIGH_PRECISION 0
-#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
 #undef  FDCT32x32_2D_AVX2
 #undef  FDCT32x32_HIGH_PRECISION
 
-#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
 #define FDCT32x32_HIGH_PRECISION 1
-#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
 #undef  FDCT32x32_2D_AVX2
 #undef  FDCT32x32_HIGH_PRECISION
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000..69889e2
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1027 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
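+// In the high bit-depth build the saturating forms are used so that any
+// value leaving the int16 range sticks at INT16_MAX/INT16_MIN, where the
+// check_epi16_overflow_* helpers can detect it and trigger the fallback to
+// the C implementation; the plain wrapping forms are used in the regular
+// 8-bit build, which performs no such checks.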
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // This 2D transform implements 4 vertical 1D transforms followed
+  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
+  // by Chen, Smith and Fralick ('77).  The commands for moving the data
+  // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations.  The outputs, o0 through oF are labeled according to the
+  // output locations.
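+  //
+  // As a rough scalar sketch (illustrative, not a verbatim copy of the C
+  // reference), each 1-D 4-point transform computed here corresponds to:
+  //   step0 = i0 + i3;  step1 = i1 + i2;
+  //   step2 = i1 - i2;  step3 = i0 - i3;
+  //   o0 = round_shift((step0 + step1) * cospi_16_64);
+  //   o2 = round_shift((step0 - step1) * cospi_16_64);
+  //   o1 = round_shift(step2 * cospi_24_64 + step3 * cospi_8_64);
+  //   o3 = round_shift(step3 * cospi_24_64 - step2 * cospi_8_64);
+  // with the SSE2 code below evaluating four such transforms per pass in
+  // parallel.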
+
+  // Constants
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            -cospi_24_64, cospi_8_64,
+                                            -cospi_24_64, cospi_8_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+                                               +(DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 =  DCT_CONST_BITS + 2;
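+  // (Folding the final (x + 1) >> 2 post-condition into the DCT rounding
+  // works because, with arithmetic shifts,
+  //   (((t + R) >> B) + 1) >> 2  ==  (t + R + (1 << B)) >> (B + 2)
+  // and 1 << DCT_CONST_BITS is twice DCT_CONST_ROUNDING, giving the 3x
+  // rounding constant defined above.)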
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+  __m128i cmp0, cmp1;
+  int test, overflow;
+#endif
+
+  // Load inputs.
+  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  2 * stride)));
+  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  3 * stride)));
+  // in0 = [i0 i1 i2 i3 iC iD iE iF]
+  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+  // Check that the inputs are small enough (within [-1024, 1023]) to use
+  // the optimised 16-bit code path.
+  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+  if (test) {
+    vpx_highbd_fdct4x4_c(input, output, stride);
+    return;
+  }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+  // multiply by 16 to give some extra precision
+  in0 = _mm_slli_epi16(in0, 4);
+  in1 = _mm_slli_epi16(in1, 4);
+  // if (i == 0 && input[0]) input[0] += 1;
+  // add 1 to the upper left pixel if it is non-zero, which helps reduce
+  // the round-trip error
+  {
+    // The mask will only contain whether the first value is zero; all
+    // other comparisons will fail, as something shifted by 4 (above << 4)
+    // can never be equal to one. To increment in the non-zero case, we
+    // add the mask and one for the first element:
+    //   - if zero, mask = -1, v = v - 1 + 1 = v
+    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+    in0 = _mm_add_epi16(in0, mask);
+    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+  }
+  // There are 4 stages in total, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
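+    // (The _mm_shuffle_epi32 immediate selects 32-bit lanes, low pair first:
+    // 0xB4 = 10 11 01 00b keeps lanes 0 and 1 and swaps lanes 2 and 3, which
+    // lines the operands up for the column-wise adds/subtracts below.)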
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
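+    // (_mm_madd_epi16 multiplies corresponding 16-bit lanes and sums each
+    // adjacent pair into a 32-bit result; per the lane layouts noted above,
+    // lane 0 of u0 below is t0[0]*p16 + t0[1]*p16, i.e. the first butterfly
+    // output of column 0 before rounding.)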
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&x0, &x1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    in0 = _mm_shuffle_epi32(x0, 0xD8);
+    in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = ADD_EPI16(in0, in1);
+    const __m128i t1 = SUB_EPI16(in0, in1);
+    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
+    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&t0, &t1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    {
+      // The constants needed here are:
+      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to the 16-bit range, folding in
+      // the final right-shift as well to save operations.
+      // This unusual rounding operation maintains bit-accurate compatibility
+      // with the C version of this function, which has two rounding steps
+      // in a row.
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+      // w0 = [o0 o4 o8 oC]
+      // w1 = [o2 o6 oA oE]
+      // w2 = [o1 o5 o9 oD]
+      // w3 = [o3 o7 oB oF]
+      // remember the o's are numbered according to the correct output location
+      const __m128i x0 = _mm_packs_epi32(w0, w1);
+      const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&x0, &x1);
+      if (overflow) {
+        vpx_highbd_fdct4x4_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+        // y1 = [o2 o3 o6 o7 oA oB oE oF]
+        in0 = _mm_unpacklo_epi32(y0, y1);
+        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+        in1 = _mm_unpackhi_epi32(y0, y1);
+        // in1 = [o8 o9 oA oB oC oD oE oF]
+      }
+    }
+  }
+  // The post-condition (v + 1) >> 2 has already been folded into the previous
+  // add and right-shift operations. Only 2 store instructions are needed
+  // because rows 1/3 are stored immediately after rows 0/2.
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+  int pass;
+  // Constants
+  //    In one case they are all the same; in all others it is a pair of
+  //    values that we need to repeat four times. This is done by
+  //    constructing the 32-bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = ADD_EPI16(in0, in7);
+    const __m128i q1 = ADD_EPI16(in1, in6);
+    const __m128i q2 = ADD_EPI16(in2, in5);
+    const __m128i q3 = ADD_EPI16(in3, in4);
+    const __m128i q4 = SUB_EPI16(in3, in4);
+    const __m128i q5 = SUB_EPI16(in2, in5);
+    const __m128i q6 = SUB_EPI16(in1, in6);
+    const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+    if (pass == 1) {
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
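+    // Only the second pass needs this check: on pass 0 the inputs are still
+    // bounded by the << 2 pre-conditioning (assuming residuals of at most
+    // +/-4095 for 12-bit content, times 4), so the pairwise sums above cannot
+    // overflow int16_t; after the first pass the values can be larger.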
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = ADD_EPI16(q0, q3);
+      const __m128i r1 = ADD_EPI16(q1, q2);
+      const __m128i r2 = SUB_EPI16(q1, q2);
+      const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants which gets us into 32 bits
+      {
+        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+        // Combine
+        res0 = _mm_packs_epi32(w0, w1);
+        res4 = _mm_packs_epi32(w2, w3);
+        res2 = _mm_packs_epi32(w4, w5);
+        res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32 bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&r0, &r1);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // Add/subtract
+        const __m128i x0 = ADD_EPI16(q4, r0);
+        const __m128i x1 = SUB_EPI16(q4, r0);
+        const __m128i x2 = SUB_EPI16(q7, r1);
+        const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants which gets us into 32 bits
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res1 = _mm_packs_epi32(w0, w1);
+          res7 = _mm_packs_epi32(w2, w3);
+          res5 = _mm_packs_epi32(w4, w5);
+          res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+          if (overflow) {
+            vpx_highbd_fdct8x8_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    divide a signed 16-bit value by two, rounding toward zero, using
+    //    shifts: n / 2 = (n - (n >> 15)) >> 1
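+    //    e.g. n = -3: (-3 - (-1)) >> 1 = -2 >> 1 = -1, matching C's
+    //    truncating division, whereas a plain n >> 1 would give -2.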
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    store_output(&in0, (output + 0 * 8));
+    store_output(&in1, (output + 1 * 8));
+    store_output(&in2, (output + 2 * 8));
+    store_output(&in3, (output + 3 * 8));
+    store_output(&in4, (output + 4 * 8));
+    store_output(&in5, (output + 5 * 8));
+    store_output(&in6, (output + 6 * 8));
+    store_output(&in7, (output + 7 * 8));
+  }
+}
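+
+// A minimal usage sketch for the low-bit-depth instantiation of this template
+// (vpx_fdct8x8_sse2, defined in fwd_txfm_sse2.c below); the buffers must be
+// 16-byte aligned since the row loads above use _mm_load_si128:
+//   DECLARE_ALIGNED(16, int16_t, residual[8 * 8]);
+//   DECLARE_ALIGNED(16, tran_low_t, coeffs[8 * 8]);
+//   vpx_fdct8x8_sse2(residual, coeffs, 8);  // stride counted in int16_t units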
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. Because the first
+  // pass left its results transposed, the second pass again operates on
+  // columns (which are the original rows) and transposes its results so that
+  // they end up back in normal/row positions.
+  int pass;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+  const int16_t *in = input;
+  int16_t *out0 = intermediate;
+  tran_low_t *out1 = output;
+  // Constants
+  //    In one case they are all the same; in all others it is a pair of
+  //    values that we need to repeat four times. This is done by
+  //    constructing the 32-bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+#if DCT_HIGH_BIT_DEPTH
+    int overflow;
+#endif
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load and pre-condition input.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;
+      // Calculate input for the first 8 results.
+      {
+        input0 = ADD_EPI16(in00, in15);
+        input1 = ADD_EPI16(in01, in14);
+        input2 = ADD_EPI16(in02, in13);
+        input3 = ADD_EPI16(in03, in12);
+        input4 = ADD_EPI16(in04, in11);
+        input5 = ADD_EPI16(in05, in10);
+        input6 = ADD_EPI16(in06, in09);
+        input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+                                           &input4, &input5, &input6, &input7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Calculate input for the next 8 results.
+      {
+        step1_0 = SUB_EPI16(in07, in08);
+        step1_1 = SUB_EPI16(in06, in09);
+        step1_2 = SUB_EPI16(in05, in10);
+        step1_3 = SUB_EPI16(in04, in11);
+        step1_4 = SUB_EPI16(in03, in12);
+        step1_5 = SUB_EPI16(in02, in13);
+        step1_6 = SUB_EPI16(in01, in14);
+        step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                           &step1_2, &step1_3,
+                                           &step1_4, &step1_5,
+                                           &step1_6, &step1_7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        // Add/subtract
+        const __m128i q0 = ADD_EPI16(input0, input7);
+        const __m128i q1 = ADD_EPI16(input1, input6);
+        const __m128i q2 = ADD_EPI16(input2, input5);
+        const __m128i q3 = ADD_EPI16(input3, input4);
+        const __m128i q4 = SUB_EPI16(input3, input4);
+        const __m128i q5 = SUB_EPI16(input2, input5);
+        const __m128i q6 = SUB_EPI16(input1, input6);
+        const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                           &q4, &q5, &q6, &q7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Work on first four results
+        {
+          // Add/subtract
+          const __m128i r0 = ADD_EPI16(q0, q3);
+          const __m128i r1 = ADD_EPI16(q1, q2);
+          const __m128i r2 = SUB_EPI16(q1, q2);
+          const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          {
+            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+          }
+        }
+        // Work on next four results
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+          const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x2(&r0, &r1);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          {
+            // Add/subtract
+            const __m128i x0 = ADD_EPI16(q4, r0);
+            const __m128i x1 = SUB_EPI16(q4, r0);
+            const __m128i x2 = SUB_EPI16(q7, r1);
+            const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+            // Interleave to do the multiply by constants which gets us
+            // into 32 bits.
+            {
+              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+              overflow = check_epi16_overflow_x4(&res02, &res14,
+                                                 &res10, &res06);
+              if (overflow) {
+                vpx_highbd_fdct16x16_c(input, output, stride);
+                return;
+              }
+#endif  // DCT_HIGH_BIT_DEPTH
+            }
+          }
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+                                             &step2_4);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 3
+        {
+          step3_0 = ADD_EPI16(step1_0, step2_3);
+          step3_1 = ADD_EPI16(step1_1, step2_2);
+          step3_2 = SUB_EPI16(step1_1, step2_2);
+          step3_3 = SUB_EPI16(step1_0, step2_3);
+          step3_4 = SUB_EPI16(step1_7, step2_4);
+          step3_5 = SUB_EPI16(step1_6, step2_5);
+          step3_6 = ADD_EPI16(step1_6, step2_5);
+          step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+                                             &step3_2, &step3_3,
+                                             &step3_4, &step3_5,
+                                             &step3_6, &step3_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 4
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+                                             &step2_5);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 5
+        {
+          step1_0 = ADD_EPI16(step3_0, step2_1);
+          step1_1 = SUB_EPI16(step3_0, step2_1);
+          step1_2 = ADD_EPI16(step3_3, step2_2);
+          step1_3 = SUB_EPI16(step3_3, step2_2);
+          step1_4 = SUB_EPI16(step3_4, step2_5);
+          step1_5 = ADD_EPI16(step3_4, step2_5);
+          step1_6 = SUB_EPI16(step3_7, step2_6);
+          step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                             &step1_2, &step1_3,
+                                             &step1_4, &step1_5,
+                                             &step1_6, &step1_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 6
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      transpose_and_output8x8(&res00, &res01, &res02, &res03,
+                              &res04, &res05, &res06, &res07,
+                              pass, out0, out1);
+      transpose_and_output8x8(&res08, &res09, &res10, &res11,
+                              &res12, &res13, &res14, &res15,
+                              pass, out0 + 8, out1 + 8);
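+      // transpose_and_output8x8 presumably writes into the int16_t
+      // intermediate buffer (out0) during pass 0 and into the tran_low_t
+      // output (out1) during pass 1, so only the matching pointer is
+      // advanced below.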
+      if (pass == 0) {
+        out0 += 8*16;
+      } else {
+        out1 += 8*16;
+      }
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+  }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c b/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000..bca72e8
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i in0, in1;
+  __m128i tmp;
+  const __m128i zero = _mm_setzero_si128();
+  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+         (input +  2 * stride)));
+  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+         (input +  3 * stride)));
+
+  tmp = _mm_add_epi16(in0, in1);
+  in0 = _mm_unpacklo_epi16(zero, tmp);
+  in1 = _mm_unpackhi_epi16(zero, tmp);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(tmp, zero);
+  in1 = _mm_unpackhi_epi32(tmp, zero);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(tmp, 8);
+
+  in1 = _mm_add_epi32(tmp, in0);
+  in0 = _mm_slli_epi32(in1, 1);
+  store_output(&in0, output);
+}
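+
+// vpx_fdct4x4_1_sse2 above computes only the DC coefficient of the transform:
+// output[0] = 2 * (sum of the 16 input values). The 8x8, 16x16 and 32x32 "_1"
+// variants below follow the same pattern, with no final shift, a >> 1 and a
+// >> 3 respectively; only output[0] is expected to be meaningful to callers.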
+
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i u0, u1, sum;
+
+  u0 = _mm_add_epi16(in0, in1);
+  u1 = _mm_add_epi16(in2, in3);
+
+  in0  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in1  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in2  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in3  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  sum = _mm_add_epi16(u0, u1);
+
+  in0 = _mm_add_epi16(in0, in1);
+  in2 = _mm_add_epi16(in2, in3);
+  sum = _mm_add_epi16(sum, in0);
+
+  u0  = _mm_setzero_si128();
+  sum = _mm_add_epi16(sum, in2);
+
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  store_output(&in1, output);
+}
+
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  __m128i in0, in1, in2, in3;
+  __m128i u0, u1;
+  __m128i sum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    input += 8 * i;
+    in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
+    in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
+    in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
+    in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
+
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
+    in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
+    in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
+    in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
+    in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
+    in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
+    in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
+    in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
+    in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
+    in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    sum = _mm_add_epi16(sum, u1);
+  }
+
+  u0  = _mm_setzero_si128();
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  in1 = _mm_srai_epi32(in1, 1);
+  store_output(&in1, output);
+}
+
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  __m128i in0, in1, in2, in3;
+  __m128i u0, u1;
+  __m128i sum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    in0  = _mm_load_si128((const __m128i *)(input +  0));
+    in1  = _mm_load_si128((const __m128i *)(input +  8));
+    in2  = _mm_load_si128((const __m128i *)(input + 16));
+    in3  = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    u0 = _mm_add_epi16(in0, in1);
+    u1 = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input +  0));
+    in1  = _mm_load_si128((const __m128i *)(input +  8));
+    in2  = _mm_load_si128((const __m128i *)(input + 16));
+    in3  = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input +  0));
+    in1  = _mm_load_si128((const __m128i *)(input +  8));
+    in2  = _mm_load_si128((const __m128i *)(input + 16));
+    in3  = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    in0  = _mm_load_si128((const __m128i *)(input +  0));
+    in1  = _mm_load_si128((const __m128i *)(input +  8));
+    in2  = _mm_load_si128((const __m128i *)(input + 16));
+    in3  = _mm_load_si128((const __m128i *)(input + 24));
+
+    input += stride;
+    sum = _mm_add_epi16(sum, u1);
+    u0  = _mm_add_epi16(in0, in1);
+    u1  = _mm_add_epi16(in2, in3);
+    sum = _mm_add_epi16(sum, u0);
+
+    sum = _mm_add_epi16(sum, u1);
+  }
+
+  u0  = _mm_setzero_si128();
+  in0 = _mm_unpacklo_epi16(u0, sum);
+  in1 = _mm_unpackhi_epi16(u0, sum);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(sum, u0);
+  in1 = _mm_unpackhi_epi32(sum, u0);
+
+  sum = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(sum, 8);
+
+  in1 = _mm_add_epi32(sum, in0);
+  in1 = _mm_srai_epi32(in1, 3);
+  store_output(&in1, output);
+}
+
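+// The 2D forward transforms are implemented as templates in
+// fwd_txfm_impl_sse2.h and fwd_dct32x32_impl_sse2.h. Including those headers
+// with different FDCT*_2D, DCT_HIGH_BIT_DEPTH and FDCT32x32_HIGH_PRECISION
+// bindings instantiates the regular entry points here and the high-bit-depth
+// ones further below; DCT_HIGH_BIT_DEPTH enables the overflow checks and C
+// fallbacks inside those implementations.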
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D vpx_fdct4x4_sse2
+#define FDCT8x8_2D vpx_fdct8x8_sse2
+#define FDCT16x16_2D vpx_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000..94d5bef
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,454 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#define VPX_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
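+// k_madd_epi32 below is the 32-bit analogue of _mm_madd_epi16: each 64-bit
+// lane of the result holds a0*b0 + a1*b1 and a2*b2 + a3*b3 respectively.
+// k_packs_epi64 gathers the low 32 bits of the four 64-bit lanes of its two
+// arguments into a single register; since only those low 32 bits are kept,
+// the unsigned _mm_mul_epu32 still yields the correct results modulo 2^32.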
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+  __m128i buf0, buf1;
+  buf0 = _mm_mul_epu32(a, b);
+  a = _mm_srli_epi64(a, 32);
+  b = _mm_srli_epi64(b, 32);
+  buf1 = _mm_mul_epu32(a, b);
+  return _mm_add_epi64(buf0, buf1);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+  return _mm_unpacklo_epi64(buf0, buf1);
+}
+
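+// _mm_packs_epi32 saturates to [INT16_MIN, INT16_MAX], so a packed value equal
+// to either bound may hide an out-of-range 32-bit intermediate. The helpers
+// below report such values; callers treat a nonzero return as a potential
+// overflow and fall back to the C high-bit-depth transforms.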
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+                                          const __m128i *preg1) {
+  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
+  cmp0 = _mm_or_si128(cmp0, cmp1);
+  return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3) {
+  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
+  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+                              _mm_cmpeq_epi16(*preg2, min_overflow));
+  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+                              _mm_cmpeq_epi16(*preg3, min_overflow));
+  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+  return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3,
+                                          const __m128i *preg4,
+                                          const __m128i *preg5,
+                                          const __m128i *preg6,
+                                          const __m128i *preg7) {
+  int res0, res1;
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11) {
+  int res0, res1;
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  if (!res0)
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+  return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15) {
+  int res0, res1;
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  if (!res0) {
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+    if (!res1)
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+  }
+  return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15,
+                                           const __m128i *preg16,
+                                           const __m128i *preg17,
+                                           const __m128i *preg18,
+                                           const __m128i *preg19,
+                                           const __m128i *preg20,
+                                           const __m128i *preg21,
+                                           const __m128i *preg22,
+                                           const __m128i *preg23,
+                                           const __m128i *preg24,
+                                           const __m128i *preg25,
+                                           const __m128i *preg26,
+                                           const __m128i *preg27,
+                                           const __m128i *preg28,
+                                           const __m128i *preg29,
+                                           const __m128i *preg30,
+                                           const __m128i *preg31) {
+  int res0, res1;
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  if (!res0) {
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+    if (!res1) {
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+      if (!res0) {
+        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+        if (!res1) {
+          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+          if (!res0) {
+            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+            if (!res1)
+              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+          }
+        }
+      }
+    }
+  }
+  return res0 + res1;
+}
+
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *zero) {
+  __m128i minus_one = _mm_set1_epi32(-1);
+  // Check for overflows
+  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+  __m128i reg0_top_dwords = _mm_shuffle_epi32(
+      reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i reg1_top_dwords = _mm_shuffle_epi32(
+      reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i reg2_top_dwords = _mm_shuffle_epi32(
+      reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i reg3_top_dwords = _mm_shuffle_epi32(
+      reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+  int overflow_23 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+  return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (!overflow) {
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+  }
+  return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (!overflow) {
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+    if (!overflow) {
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+                                          zero);
+      if (!overflow) {
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
+      }
+    }
+  }
+  return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *preg16,
+                                            const __m128i *preg17,
+                                            const __m128i *preg18,
+                                            const __m128i *preg19,
+                                            const __m128i *preg20,
+                                            const __m128i *preg21,
+                                            const __m128i *preg22,
+                                            const __m128i *preg23,
+                                            const __m128i *preg24,
+                                            const __m128i *preg25,
+                                            const __m128i *preg26,
+                                            const __m128i *preg27,
+                                            const __m128i *preg28,
+                                            const __m128i *preg29,
+                                            const __m128i *preg30,
+                                            const __m128i *preg31,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (!overflow) {
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+    if (!overflow) {
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+      if (!overflow) {
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
+        if (!overflow) {
+          overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+                                              zero);
+          if (!overflow) {
+            overflow = k_check_epi32_overflow_4(preg20, preg21,
+                                                preg22, preg23, zero);
+            if (!overflow) {
+              overflow = k_check_epi32_overflow_4(preg24, preg25,
+                                                  preg26, preg27, zero);
+              if (!overflow) {
+                overflow = k_check_epi32_overflow_4(preg28, preg29,
+                                                    preg30, preg31, zero);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return overflow;
+}
+
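+// With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits wide, so the eight
+// 16-bit results are sign-extended and written as two 128-bit stores;
+// otherwise a single 16-bit store suffices.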
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  _mm_store_si128((__m128i *)(dst_ptr), out0);
+  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+
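+// Per 32-bit lane this computes saturate_int16((a * c0 + b * c1 + rounding)
+// >> shift), where (a, b) are adjacent 16-bit elements of the input and
+// (c0, c1) the matching elements of *pmultiplier.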
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+                                       const __m128i *pin1,
+                                       const __m128i *pmultiplier,
+                                       const __m128i *prounding,
+                                       const int shift) {
+  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+  const __m128i v0 = _mm_add_epi32(u0, *prounding);
+  const __m128i v1 = _mm_add_epi32(u1, *prounding);
+  const __m128i w0 = _mm_srai_epi32(v0, shift);
+  const __m128i w1 = _mm_srai_epi32(v1, shift);
+  return _mm_packs_epi32(w0, w1);
+}
+
+static INLINE void transpose_and_output8x8(
+    const __m128i *pin00, const __m128i *pin01,
+    const __m128i *pin02, const __m128i *pin03,
+    const __m128i *pin04, const __m128i *pin05,
+    const __m128i *pin06, const __m128i *pin07,
+    const int pass, int16_t* out0_ptr,
+    tran_low_t* out1_ptr) {
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  // 04 14 05 15 06 16 07 17
+  // 24 34 25 35 26 36 27 37
+  // 40 50 41 51 42 52 43 53
+  // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+  // 64 74 65 75 66 76 67 77
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // 00 10 20 30 01 11 21 31
+  // 40 50 60 70 41 51 61 71
+  // 02 12 22 32 03 13 23 33
+  // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+  // 06 16 26 36 07 17 27 37
+  // 46 56 66 76 47 57 67 77
+  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+  // 00 10 20 30 40 50 60 70
+  // 01 11 21 31 41 51 61 71
+  // 02 12 22 32 42 52 62 72
+  // 03 13 23 33 43 53 63 73
+  // 04 14 24 34 44 54 64 74
+  // 05 15 25 35 45 55 65 75
+  // 06 16 26 36 46 56 66 76
+  // 07 17 27 37 47 57 67 77
+  if (pass == 0) {
+    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
+  } else {
+    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000..78a1dbb
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,183 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides the SSSE3 version of the forward transformation. Some
+; of the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585,  11585
+TRANSFORM_COEFFS 15137,   6270
+TRANSFORM_COEFFS 16069,   3196
+TRANSFORM_COEFFS  9102,  13623
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
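+  ; per 32-bit lane: dstN = (pmaddwd(src, coefsN) + round) >> 14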
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
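+  ; dst1 = (dst1 * coef1 + dst2 * coef2 + round) >> 14
+  ; dst2 = (dst2 * coef1 - dst1 * coef2 + round) >> 14
+  ; results are repacked to 16 bits with signed saturation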
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 1
+  SUM_SUB            0,  7,  9
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+%if %1 == 0
+  SUM_SUB            0,  1,  9
+%endif
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12
+  pmulhrsw           m5, m12
+%if %1 == 0
+  pmulhrsw           m0, m12
+  pmulhrsw           m1, m12
+%else
+  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
+  SWAP               0,  1
+%endif
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4
+  SWAP               3,  6
+%endmacro
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
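+  ; dstN = dstN / 2, rounding toward zero: the sign bit (x >> 15) is
+  ; subtracted before the arithmetic shift so negative values round up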
+  psraw              m%3, m%1, 15
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3
+  psubw              m%2, m%4
+  psraw              m%1, 1
+  psraw              m%2, 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+  mova               m8, [pd_8192]
+  mova              m12, [pw_11585x2]
+  pxor              m11, m11
+
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform
+  FDCT8_1D  0
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D  1
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
new file mode 100644
index 0000000..b12d29c
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -0,0 +1,476 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4
+pw_8:  times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
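+; Note: pw_16 and pw_32 are dword (dd) constants despite the pw_ prefix;
+; they are added with paddd below.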
+
+SECTION .text
+INIT_MMX sse
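+; DC prediction: fill the block with (sum of 4 above + 4 left pixels + 4) >> 3.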
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, one
+  mov                 oned, 0x0001
+  pxor                  m1, m1
+  movd                  m3, oned
+  pshufw                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, one
+  mov                 oned, 0x00010001
+  lea             stride3q, [strideq*3]
+  movd                  m3, oned
+  pshufd                m3, m3, 0x0
+  paddw                 m0, m2
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  packssdw              m0, m1
+  pmaddwd               m0, m3
+  paddw                 m0, [GLOBAL(pw_8)]
+  psrlw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m3, [aboveq+16]
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_16)]
+  psrad                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova   [dstq              ], m0
+  mova   [dstq           +16], m0
+  mova   [dstq+strideq*2    ], m0
+  mova   [dstq+strideq*2 +16], m0
+  mova   [dstq+strideq*4    ], m0
+  mova   [dstq+strideq*4 +16], m0
+  mova   [dstq+stride3q*2   ], m0
+  mova   [dstq+stride3q*2+16], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  mova                  m5, [leftq]
+  mova                  m6, [leftq+16]
+  mova                  m7, [leftq+32]
+  mova                  m8, [leftq+48]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  paddw                 m0, m5
+  paddw                 m0, m6
+  paddw                 m0, m7
+  paddw                 m0, m8
+  movhlps               m2, m0
+  paddw                 m0, m2
+  punpcklwd             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2
+  paddd                 m0, [GLOBAL(pw_32)]
+  psrad                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+.loop:
+  mova [dstq               ], m0
+  mova [dstq          +16  ], m0
+  mova [dstq          +32  ], m0
+  mova [dstq          +48  ], m0
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2+16  ], m0
+  mova [dstq+strideq*2+32  ], m0
+  mova [dstq+strideq*2+48  ], m0
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4+16  ], m0
+  mova [dstq+strideq*4+32  ], m0
+  mova [dstq+strideq*4+48  ], m0
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m0
+  mova [dstq+stride3q*2 +32], m0
+  mova [dstq+stride3q*2 +48], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+%endif
+
+INIT_MMX sse
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:
+  mova    [dstq              ], m0
+  mova    [dstq           +16], m1
+  mova    [dstq+strideq*2    ], m0
+  mova    [dstq+strideq*2 +16], m1
+  mova    [dstq+strideq*4    ], m0
+  mova    [dstq+strideq*4 +16], m1
+  mova    [dstq+stride3q*2   ], m0
+  mova    [dstq+stride3q*2+16], m1
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  mova                  m2, [aboveq+32]
+  mova                  m3, [aboveq+48]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq            +32], m2
+  mova [dstq            +48], m3
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2  +16], m1
+  mova [dstq+strideq*2  +32], m2
+  mova [dstq+strideq*2  +48], m3
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4  +16], m1
+  mova [dstq+strideq*4  +32], m2
+  mova [dstq+strideq*4  +48], m3
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m1
+  mova [dstq+stride3q*2 +32], m2
+  mova [dstq+stride3q*2 +48], m3
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_MMX sse
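+; TM (TrueMotion) prediction:
+;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)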
+cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  movq                  m0, [aboveq]
+  pshufw                m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  movd                  m3, oned
+  movd                  m4, bpsd
+  pshufw                m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -2
+  mova                  m2, m3
+  psllw                 m3, m4
+  add                leftq, 8
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movq                  m1, [leftq+lineq*4]
+  movq                  m2, [leftq+lineq*4+2]
+  pshufw                m1, m1, 0x0
+  pshufw                m2, m2, 0x0
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m1
+  movq    [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2]
+  mova                  m0, [aboveq]
+  pshuflw               m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m3, m3
+  pxor                  m4, m4
+  pinsrw                m3, oned, 0
+  pinsrw                m4, bpsd, 0
+  pshuflw               m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m3, m3
+  mov                lineq, -4
+  mova                  m2, m3
+  punpcklqdq            m1, m1
+  psllw                 m3, m4
+  add                leftq, 16
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1
+.loop:
+  movd                  m1, [leftq+lineq*4]
+  movd                  m2, [leftq+lineq*4+2]
+  pshuflw               m1, m1, 0x0
+  pshuflw               m2, m2, 0x0
+  punpcklqdq            m1, m1
+  punpcklqdq            m2, m2
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  mova      [dstq          ], m1
+  mova      [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
+  movd                  m2, [aboveq-2]
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  pshuflw               m2, m2, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m7, m7
+  pxor                  m8, m8
+  pinsrw                m7, oned, 0
+  pinsrw                m8, bpsd, 0
+  pshuflw               m7, m7, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq            m7, m7
+  mov                lineq, -8
+  mova                  m5, m7
+  punpcklqdq            m2, m2
+  psllw                 m7, m8
+  add                leftq, 32
+  psubw                 m7, m5 ; max possible value
+  pxor                  m8, m8 ; min possible value
+  psubw                 m0, m2
+  psubw                 m1, m2
+.loop:
+  movd                  m2, [leftq+lineq*4]
+  movd                  m3, [leftq+lineq*4+2]
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m4, m2, m0
+  paddw                 m5, m3, m0
+  paddw                 m2, m1
+  paddw                 m3, m1
+  ;Clamp to the bit-depth
+  pminsw                m4, m7
+  pminsw                m5, m7
+  pminsw                m2, m7
+  pminsw                m3, m7
+  pmaxsw                m4, m8
+  pmaxsw                m5, m8
+  pmaxsw                m2, m8
+  pmaxsw                m3, m8
+  ;Store the values
+  mova   [dstq             ], m4
+  mova   [dstq+strideq*2   ], m5
+  mova   [dstq          +16], m2
+  mova   [dstq+strideq*2+16], m3
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+  movd                  m0, [aboveq-2]
+  mova                  m1, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  pshuflw               m0, m0, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                 m10, m10
+  pxor                 m11, m11
+  pinsrw               m10, oned, 0
+  pinsrw               m11, bpsd, 0
+  pshuflw              m10, m10, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  punpcklqdq           m10, m10
+  mov                lineq, -16
+  mova                  m5, m10
+  punpcklqdq            m0, m0
+  psllw                m10, m11
+  add                leftq, 64
+  psubw                m10, m5 ; max possible value
+  pxor                 m11, m11 ; min possible value
+  psubw                 m1, m0
+  psubw                 m2, m0
+  psubw                 m3, m0
+  psubw                 m4, m0
+.loop:
+  movd                  m5, [leftq+lineq*4]
+  movd                  m6, [leftq+lineq*4+2]
+  pshuflw               m5, m5, 0x0
+  pshuflw               m6, m6, 0x0
+  punpcklqdq            m5, m5
+  punpcklqdq            m6, m6
+  paddw                 m7, m5, m1
+  paddw                 m8, m5, m2
+  paddw                 m9, m5, m3
+  paddw                 m5, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m5, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m5, m11
+  ;Store these values
+  mova   [dstq           ], m7
+  mova   [dstq        +16], m8
+  mova   [dstq        +32], m9
+  mova   [dstq        +48], m5
+  paddw                 m7, m6, m1
+  paddw                 m8, m6, m2
+  paddw                 m9, m6, m3
+  paddw                 m6, m4
+  ;Clamp these values to the bit-depth
+  pminsw                m7, m10
+  pminsw                m8, m10
+  pminsw                m9, m10
+  pminsw                m6, m10
+  pmaxsw                m7, m11
+  pmaxsw                m8, m11
+  pmaxsw                m9, m11
+  pmaxsw                m6, m11
+  ;Store these values
+  mova   [dstq+strideq*2   ], m7
+  mova   [dstq+strideq*2+16], m8
+  mova   [dstq+strideq*2+32], m9
+  mova   [dstq+strideq*2+48], m6
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+%endif
diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000..c4fd5e1
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1214 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
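+// Clamp the packed 16-bit values to the signed range used by the filter at
+// bit depth bd, i.e. [-(1 << (bd - 1)), (1 << (bd - 1)) - 1].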
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+  __m128i ubounded;
+  __m128i lbounded;
+  __m128i retval;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i t80, max, min;
+
+  if (bd == 8) {
+    t80 = _mm_set1_epi16(0x80);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+  } else if (bd == 10) {
+    t80 = _mm_set1_epi16(0x200);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+  } else {  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+    max = _mm_subs_epi16(
+              _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+  }
+
+  min = _mm_subs_epi16(zero, t80);
+
+  ubounded = _mm_cmpgt_epi16(value, max);
+  lbounded = _mm_cmplt_epi16(value, min);
+  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+  ubounded = _mm_and_si128(ubounded, max);
+  lbounded = _mm_and_si128(lbounded, min);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_or_si128(retval, lbounded);
+  return retval;
+}
+
+// TODO(debargha, peter): Break up large functions into smaller ones
+// in this file.
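+// Filters one horizontal edge, 8 pixels at a time: the narrow filter4 result
+// is computed first and then replaced by the filter8 / filter16 outputs
+// wherever the flat and flat2 masks are set.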
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
+                                                   int p,
+                                                   const uint8_t *_blimit,
+                                                   const uint8_t *_limit,
+                                                   const uint8_t *_thresh,
+                                                   int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i blimit, limit, thresh;
+  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+  __m128i ps1, qs1, ps0, qs0;
+  __m128i abs_p0q0, abs_p1q1, ffff, work;
+  __m128i filt, work_a, filter1, filter2;
+  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+  __m128i flat2_q0, flat2_p0;
+  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+  __m128i pixelFilter_p, pixelFilter_q;
+  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+  __m128i t4, t3, t80, t1;
+  __m128i eight, four;
+
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+  }
+
+  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+  //  highbd_filter_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+                                    _mm_subs_epu16(p0, p1)),
+                       _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                    _mm_subs_epu16(q0, q1)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
+
+  // lp filter
+  // highbd_filter4
+  t4 = _mm_set1_epi16(4);
+  t3 = _mm_set1_epi16(3);
+  if (bd == 8)
+    t80 = _mm_set1_epi16(0x80);
+  else if (bd == 10)
+    t80 = _mm_set1_epi16(0x200);
+  else  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+
+  t1 = _mm_set1_epi16(0x1);
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+
+  filt = _mm_and_si128(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  filt = _mm_and_si128(filt, mask);
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  filter1 = _mm_srai_epi16(filter1, 0x3);
+  filter2 = _mm_srai_epi16(filter2, 0x3);
+
+  qs0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+      t80);
+  ps0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+      t80);
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(hev, filt);
+  qs1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+      t80);
+  ps1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+      t80);
+
+  // end highbd_filter4
+  // loopfilter done
+
+  // highbd_flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  flat = _mm_max_epi16(work, flat);
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  // end flat_mask4
+
+  // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each block of 16 either all 1s or all 0s)
+  flat = _mm_and_si128(flat, mask);
+
+  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+  // but referred to as p0-p4 & q0-q4 in fn)
+  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+                                     _mm_subs_epu16(p0, p4)),
+                        _mm_or_si128(_mm_subs_epu16(q4, q0),
+                                     _mm_subs_epu16(q0, q4)));
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+                                    _mm_subs_epu16(p0, p5)),
+                       _mm_or_si128(_mm_subs_epu16(q5, q0),
+                                    _mm_subs_epu16(q0, q5)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+                                    _mm_subs_epu16(p0, p6)),
+                       _mm_or_si128(_mm_subs_epu16(q6, q0),
+                                    _mm_subs_epu16(q0, q6)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+                                    _mm_subs_epu16(p0, p7)),
+                       _mm_or_si128(_mm_subs_epu16(q7, q0),
+                                    _mm_subs_epu16(q0, q7)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  if (bd == 8)
+    flat2 = _mm_subs_epu16(flat2, one);
+  else if (bd == 10)
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+  flat2 = _mm_cmpeq_epi16(flat2, zero);
+  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  // end highbd_flat_mask5
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
+  eight = _mm_set1_epi16(8);
+  four = _mm_set1_epi16(4);
+
+  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+                                _mm_add_epi16(p4, p3));
+  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+                                _mm_add_epi16(q4, q3));
+
+  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                      pixelFilter_q));
+  pixetFilter_p2p1p0 =   _mm_add_epi16(four,
+                                       _mm_add_epi16(pixetFilter_p2p1p0,
+                                                     pixetFilter_q2q1q0));
+  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(p7, p0)), 4);
+  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(q7, q0)), 4);
+  flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(p3, p0)), 3);
+  flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(q3, q0)), 3);
+
+  sum_p7 = _mm_add_epi16(p7, p7);
+  sum_q7 = _mm_add_epi16(q7, q7);
+  sum_p3 = _mm_add_epi16(p3, p3);
+  sum_q3 = _mm_add_epi16(q3, q3);
+
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
+  flat2_p1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+  flat2_q1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+  flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p1)), 3);
+  flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q1)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  sum_p3 = _mm_add_epi16(sum_p3, p3);
+  sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+  flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p2)), 4);
+  flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q2)), 4);
+
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+  flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p2)), 3);
+  flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q2)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+  flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p3)), 4);
+  flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q3)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+  flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p4)), 4);
+  flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q4)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+  flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p5)), 4);
+  flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q5)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+  flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p6)), 4);
+  flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q6)), 4);
+
+  //  wide flat
+  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  //  highbd_filter8
+  p2 = _mm_andnot_si128(flat, p2);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p2 = _mm_and_si128(flat, flat_p2);
+  //  when (flat && mask)
+  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat, q2);
+  flat_q2 = _mm_and_si128(flat, flat_q2);
+  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values
+
+  ps1 = _mm_andnot_si128(flat, ps1);
+  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p1 = _mm_and_si128(flat, flat_p1);
+  //  when (flat && mask)
+  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
+  qs1 = _mm_andnot_si128(flat, qs1);
+  flat_q1 = _mm_and_si128(flat, flat_q1);
+  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values
+
+  ps0 = _mm_andnot_si128(flat, ps0);
+  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p0 = _mm_and_si128(flat, flat_p0);
+  //  when (flat && mask)
+  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
+  qs0 = _mm_andnot_si128(flat, qs0);
+  flat_q0 = _mm_and_si128(flat, flat_q0);
+  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
+  // end highbd_filter8
+
+  // highbd_filter16
+  p6 = _mm_andnot_si128(flat2, p6);
+  //  p6 remains unchanged if !(flat2 && flat && mask)
+  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+  //  get values for when (flat2 && flat && mask)
+  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
+  q6 = _mm_andnot_si128(flat2, q6);
+  //  q6 remains unchanged if !(flat2 && flat && mask)
+  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+  //  get values for when (flat2 && flat && mask)
+  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
+  _mm_store_si128((__m128i *)(s - 7 * p), p6);
+  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+  p5 = _mm_andnot_si128(flat2, p5);
+  //  p5 remains unchanged if !(flat2 && flat && mask)
+  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+  //  get values for when (flat2 && flat && mask)
+  p5 = _mm_or_si128(p5, flat2_p5);
+  //  full list of p5 values
+  q5 = _mm_andnot_si128(flat2, q5);
+  //  q5 remains unchanged if !(flat2 && flat && mask)
+  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+  //  get values for when (flat2 && flat && mask)
+  q5 = _mm_or_si128(q5, flat2_q5);
+  //  full list of q5 values
+  _mm_store_si128((__m128i *)(s - 6 * p), p5);
+  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+  p4 = _mm_andnot_si128(flat2, p4);
+  //  p4 remains unchanged if !(flat2 && flat && mask)
+  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+  //  get values for when (flat2 && flat && mask)
+  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
+  q4 = _mm_andnot_si128(flat2, q4);
+  //  q4 remains unchanged if !(flat2 && flat && mask)
+  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+  //  get values for when (flat2 && flat && mask)
+  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
+  _mm_store_si128((__m128i *)(s - 5 * p), p4);
+  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+  p3 = _mm_andnot_si128(flat2, p3);
+  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+  //  get values for when (flat2 && flat && mask)
+  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
+  q3 = _mm_andnot_si128(flat2, q3);
+  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+  //  get values for when (flat2 && flat && mask)
+  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
+  _mm_store_si128((__m128i *)(s - 4 * p), p3);
+  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+  p2 = _mm_andnot_si128(flat2, p2);
+  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+  //  get values for when (flat2 && flat && mask)
+  p2 = _mm_or_si128(p2, flat2_p2);
+  //  full list of p2 values
+  q2 = _mm_andnot_si128(flat2, q2);
+  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+  //  get values for when (flat2 && flat && mask)
+  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+  p1 = _mm_andnot_si128(flat2, p1);
+  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+  //  get values for when (flat2 && flat && mask)
+  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
+  q1 = _mm_andnot_si128(flat2, q1);
+  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+  //  get values for when (flat2 && flat && mask)
+  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+  p0 = _mm_andnot_si128(flat2, p0);
+  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+  //  get values for when (flat2 && flat && mask)
+  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
+  q0 = _mm_andnot_si128(flat2, q0);
+  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+  //  get values for when (flat2 && flat && mask)
+  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+}
+
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
+                                                    int p,
+                                                    const uint8_t *_blimit,
+                                                    const uint8_t *_limit,
+                                                    const uint8_t *_thresh,
+                                                    int bd) {
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
+                                         bd);
+}
+
+// TODO(yunqingwang): remove count and call these two functions (8 or 16) directly.
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+                                       const uint8_t *_blimit,
+                                       const uint8_t *_limit,
+                                       const uint8_t *_thresh,
+                                       int count, int bd) {
+  if (count == 1)
+    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+  else
+    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i workp_a, workp_b, workp_shft;
+
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
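+  // blimit/limit/thresh are passed as 8-bit values; scale them and the
+  // signed-range offset t80 up by (bd - 8) bits to match the high bit depth
+  // pixel range.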
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_set1_epi16(0x200);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_set1_epi16(0x800);
+  }
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+
+  // filter_mask and hev_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                          _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                          _mm_subs_epu16(q0, q1));
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                          _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                          _mm_subs_epu16(q1, p1));
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_p1p0, flat);
+  flat = _mm_max_epi16(abs_q1q0, flat);
+
+  if (bd == 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
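+  // The block below computes the filter8 outputs (weighted averages over
+  // p3..q3) into the flat_o* arrays; they are selected further down wherever
+  // the flat mask is set.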
+  // 'four' is added before the shift to implement the rounding of
+  // ROUND_POWER_OF_TWO.
+
+  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+  // lp filter
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
+  filt = signed_char_clamp_bd_sse2(filt, bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = _mm_adds_epi16(filt, t4);
+  filter2 = _mm_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+  filter1 = _mm_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+  filter2 = _mm_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm_andnot_si128(hev, filt);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q0 = _mm_load_si128((__m128i *)flat_oq0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q0 = _mm_and_si128(flat, q0);
+  q0 = _mm_or_si128(work_a, q0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q1 = _mm_load_si128((__m128i *)flat_oq1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q1 = _mm_and_si128(flat, q1);
+  q1 = _mm_or_si128(work_a, q1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q2 = _mm_load_si128((__m128i *)flat_oq2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q2 = _mm_and_si128(flat, q2);
+  q2 = _mm_or_si128(work_a, q2);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p0 = _mm_load_si128((__m128i *)flat_op0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p0 = _mm_and_si128(flat, p0);
+  p0 = _mm_or_si128(work_a, p0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p1 = _mm_load_si128((__m128i *)flat_op1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p1 = _mm_and_si128(flat, p1);
+  p1 = _mm_or_si128(work_a, p1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p2 = _mm_load_si128((__m128i *)flat_op2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p2 = _mm_and_si128(flat, p2);
+  p2 = _mm_or_si128(work_a, p2);
+
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+                                   1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                                        _mm_subs_epu16(p0, p1));
+  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                        _mm_subs_epu16(q0, q1));
+  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                                  _mm_subs_epu16(q0, p0));
+  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                                  _mm_subs_epu16(q1, p1));
+  __m128i work;
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  __m128i tff80;
+  __m128i tffe0;
+  __m128i t1f;
+  // equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i t7f;
+  // equivalent to shifting 0x7f left by bitdepth - 8
+  // and setting new bits to 1
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;
+
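+  // As in the functions above, the 8-bit thresholds and filter constants are
+  // scaled up by (bd - 8) bits to match the high bit depth pixel range.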
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+    tff80 = _mm_set1_epi16(0xff80);
+    tffe0 = _mm_set1_epi16(0xffe0);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+  }
+
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
+  // filter_mask and hev_mask
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  // filter4
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm_srli_epi16(filter1, 3);
+  work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
+  filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
+  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+
+  // Filter2 >> 3
+  work_a = _mm_cmpgt_epi16(zero, filter2);
+  filter2 = _mm_srli_epi16(filter2, 3);
+  work_a = _mm_and_si128(work_a, tffe0);
+  filter2 = _mm_and_si128(filter2, t1f);
+  filter2 = _mm_or_si128(filter2, work_a);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  work_a = _mm_cmpgt_epi16(zero, filt);
+  filt = _mm_srli_epi16(filt, 1);
+  work_a = _mm_and_si128(work_a, tff80);
+  filt = _mm_and_si128(filt, t7f);
+  filt = _mm_or_si128(filt, work_a);
+
+  filt = _mm_andnot_si128(hev, filt);
+
+  q0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
+  q1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+  p0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+  p1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+                                   bd);
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,
+                                    uint16_t *dst[], int out_p,
+                                    int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    uint16_t *in = src[idx8x8];
+    uint16_t *out = dst[idx8x8];
+
+    p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13
+    x0 = _mm_unpacklo_epi16(p0, p1);
+    // 20 30 21 31 22 32 23 33
+    x1 = _mm_unpacklo_epi16(p2, p3);
+    // 40 50 41 51 42 52 43 53
+    x2 = _mm_unpacklo_epi16(p4, p5);
+    // 60 70 61 71 62 72 63 73
+    x3 = _mm_unpacklo_epi16(p6, p7);
+    // 00 10 20 30 01 11 21 31
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 40 50 60 70 41 51 61 71
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 00 10 20 30 40 50 60 70
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 01 11 21 31 41 51 61 71
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+    // 00 10 20 30 40 50 60 70
+    _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+    // 01 11 21 31 41 51 61 71
+
+    // 02 12 22 32 03 13 23 33
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 42 52 62 72 43 53 63 73
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 02 12 22 32 42 52 62 72
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+    // 02 12 22 32 42 52 62 72
+    _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+    // 03 13 23 33 43 53 63 73
+
+    // 04 14 05 15 06 16 07 17
+    x0 = _mm_unpackhi_epi16(p0, p1);
+    // 24 34 25 35 26 36 27 37
+    x1 = _mm_unpackhi_epi16(p2, p3);
+    // 44 54 45 55 46 56 47 57
+    x2 = _mm_unpackhi_epi16(p4, p5);
+    // 64 74 65 75 66 76 67 77
+    x3 = _mm_unpackhi_epi16(p6, p7);
+    // 04 14 24 34 05 15 25 35
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 44 54 64 74 45 55 65 75
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 04 14 24 34 44 54 64 74
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 05 15 25 35 45 55 65 75
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+    // 04 14 24 34 44 54 64 74
+    _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+    // 05 15 25 35 45 55 65 75
+
+    // 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 06 16 26 36 46 56 66 76
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+    // 06 16 26 36 46 56 66 76
+    _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+    // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
+                                        int in_p, uint16_t *out, int out_p) {
+  uint16_t *src0[1];
+  uint16_t *src1[1];
+  uint16_t *dest0[1];
+  uint16_t *dest1[1];
+  src0[0] = in0;
+  src1[0] = in1;
+  dest0[0] = out;
+  dest1[0] = out + 8;
+  highbd_transpose(src0, in_p, dest0, out_p, 1);
+  highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
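+  // Vertical filtering: transpose the 8x8 block into a scratch buffer, run
+  // the horizontal filter on it, then transpose the result back.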
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering
+  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
+                                     int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  src[0] = s - 8;
+  src[1] = s;
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;
+
+  // Transpose 16x8
+  highbd_transpose(src, p, dst, 8, 2);
+
+  // Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+                                         thresh, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back
+  highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+                                          int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+
+  //  Transpose 16x16
+  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+  //  Loop filtering
+  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                          thresh, bd);
+
+  //  Transpose back
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000..fd46bef
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,179 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
+                                intptr_t count,
+                                int skip_block,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr,
+                                tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr,
+                                uint16_t *eob_ptr,
+                                const int16_t *scan,
+                                const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
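+    // Walk the 4-coefficient groups from the end and drop trailing groups in
+    // which every coefficient lies strictly inside the zero bin.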
+    for (i = ((int)count / 4) - 1; i >= 0; i--) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (test == 0xffff)
+        non_zero_regs--;
+      else
+        break;
+    }
+
+    // Quantization pass:
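+    // For each remaining group, spill the absolute values and signs to
+    // arrays, then quantize the lanes at or above zbin with scalar code.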
+    for (i = 0; i < non_zero_regs; i++) {
+      __m128i coeffs, coeffs_sign, tmp1, tmp2;
+      int test;
+      int abs_coeff[4];
+      int coeff_sign[4];
+
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      coeffs_sign = _mm_srai_epi32(coeffs, 31);
+      coeffs = _mm_sub_epi32(
+            _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+      tmp1 = _mm_or_si128(tmp1, tmp2);
+      test = _mm_movemask_epi8(tmp1);
+      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+      for (j = 0; j < 4; j++) {
+        if (test & (1 << (4 * j))) {
+          int k = 4 * i + j;
+          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
+          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+          const uint32_t abs_qcoeff =
+              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+          if (abs_qcoeff)
+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+        }
+      }
+    }
+  }
+  *eob_ptr = eob_i + 1;
+}
+
+
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
+                                      intptr_t n_coeffs,
+                                      int skip_block,
+                                      const int16_t *zbin_ptr,
+                                      const int16_t *round_ptr,
+                                      const int16_t *quant_ptr,
+                                      const int16_t *quant_shift_ptr,
+                                      tran_low_t *qcoeff_ptr,
+                                      tran_low_t *dqcoeff_ptr,
+                                      const int16_t *dequant_ptr,
+                                      uint16_t *eob_ptr,
+                                      const int16_t *scan,
+                                      const int16_t *iscan) {
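+  // Same scheme as vpx_highbd_quantize_b_sse2 above, except that zbin, round
+  // and the dequantized output are effectively halved and the final shift is
+  // 15 rather than 16, to match the different scaling of the 32x32 transform.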
+  __m128i zbins[2];
+  __m128i nzbins[2];
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+  (void)scan;
+  zbins[0] = _mm_set_epi32(zbin1_tmp,
+                           zbin1_tmp,
+                           zbin1_tmp,
+                           zbin0_tmp);
+  zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs / 4; i++) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (!(test & 0xf))
+        idx_arr[idx++] = i * 4;
+      if (!(test & 0xf0))
+        idx_arr[idx++] = i * 4 + 1;
+      if (!(test & 0xf00))
+        idx_arr[idx++] = i * 4 + 2;
+      if (!(test & 0xf000))
+        idx_arr[idx++] = i * 4 + 3;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = idx_arr[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
diff --git a/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000..6c2a61e
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
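+  ; |src - ref| for unsigned words is formed as (src -us ref) | (ref -us src)
+  ; with saturating subtraction in both directions; pmaddwd against m1 (1 in
+  ; every word lane) then folds adjacent differences into dword sums.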
+  movh                  m0, [srcq +%2*2]
+%if %1 == 1
+  movu                  m4, [ref1q+%3*2]
+  movu                  m5, [ref2q+%3*2]
+  movu                  m6, [ref3q+%3*2]
+  movu                  m7, [ref4q+%3*2]
+  movhps                m0, [srcq +%4*2]
+  movhps                m4, [ref1q+%5*2]
+  movhps                m5, [ref2q+%5*2]
+  movhps                m6, [ref3q+%5*2]
+  movhps                m7, [ref4q+%5*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4
+  psubusw               m2, m5
+  psubusw               m4, m0
+  psubusw               m5, m0
+  por                   m4, m3
+  por                   m5, m2
+  pmaddwd               m4, m1
+  pmaddwd               m5, m1
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3
+  por                   m7, m2
+  pmaddwd               m6, m1
+  pmaddwd               m7, m1
+%else
+  movu                  m2, [ref1q+%3*2]
+  movhps                m0, [srcq +%4*2]
+  movhps                m2, [ref1q+%5*2]
+  mova                  m3, m0
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+
+  movu                  m2, [ref2q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref2q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+
+  movu                  m2, [ref3q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref3q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+
+  movu                  m2, [ref4q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref4q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4]
+  lea                ref1q, [ref1q+ref_strideq*4]
+  lea                ref2q, [ref2q+ref_strideq*4]
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+  ; 1st 8 px
+  mova                  m0, [srcq +%2*2]
+%if %1 == 1
+  movu                  m4, [ref1q+%3*2]
+  movu                  m5, [ref2q+%3*2]
+  movu                  m6, [ref3q+%3*2]
+  movu                  m7, [ref4q+%3*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4
+  psubusw               m2, m5
+  psubusw               m4, m0
+  psubusw               m5, m0
+  por                   m4, m3
+  por                   m5, m2
+  pmaddwd               m4, m1
+  pmaddwd               m5, m1
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3
+  por                   m7, m2
+  pmaddwd               m6, m1
+  pmaddwd               m7, m1
+%else
+  mova                  m3, m0
+  movu                  m2, [ref1q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+  movu                  m2, [ref2q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+  movu                  m2, [ref3q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+  movu                  m2, [ref4q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endif
+
+  ; 2nd 8 px
+  mova                  m0, [srcq +(%4)*2]
+  mova                  m3, m0
+  movu                  m2, [ref1q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+  movu                  m2, [ref2q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+  movu                  m2, [ref3q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+  movu                  m2, [ref4q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4]
+  lea                ref1q, [ref1q+ref_strideq*4]
+  lea                ref2q, [ref2q+ref_strideq*4]
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16,
+;             8x8, 8x4, 4x8 or 4x4
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+
+; set m1
+  push                srcq
+  mov                 srcd, 0x00010001
+  movd                  m1, srcd
+  pshufd                m1, m1, 0x0
+  pop                 srcq
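+; m1 now holds 1 in every word lane; the HIGH_PROCESS macros use pmaddwd with
+; it to fold adjacent word absolute differences into dword sums.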
+
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1]
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+  shl                 srcq, 1
+  shl                ref2q, 1
+  shl                ref3q, 1
+  shl                ref4q, 1
+  shl                ref1q, 1
+
+  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits), so in high bit depth even the
+  ; smallest width (4) needs 128 bits, i.e. an XMM register
+  movhlps               m0, m4
+  movhlps               m1, m5
+  movhlps               m2, m6
+  movhlps               m3, m7
+  paddd                 m4, m0
+  paddd                 m5, m1
+  paddd                 m6, m2
+  paddd                 m7, m3
+  punpckldq             m4, m5
+  punpckldq             m6, m7
+  movhlps               m0, m4
+  movhlps               m1, m6
+  paddd                 m4, m0
+  paddd                 m6, m1
+  punpcklqdq            m4, m6
+  movifnidn             r4, r4mp
+  movu                [r4], m4
+  RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16,  8
+HIGH_SADNXN4D  8, 16
+HIGH_SADNXN4D  8,  8
+HIGH_SADNXN4D  8,  4
+HIGH_SADNXN4D  4,  8
+HIGH_SADNXN4D  4,  4
diff --git a/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000..bc4b28d
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,363 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea         src_stride3q, [src_strideq*3]
+  lea         ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+  shl                 srcq, 1
+  shl                 refq, 1
+%if %4 == 1
+  shl         second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+  HIGH_SAD_FN 64, %1, 5, %2
+  mov              n_rowsd, %1
+  pxor                  m0, m0
+  pxor                  m6, m6
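+  ; m0 accumulates the dword SAD total; m6 stays zero and is used by
+  ; punpcklwd to widen the per-iteration word sums before they are added.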
+
+.loop:
+  ; first half of each row
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1
+  psubusw               m1, [srcq]
+  por                   m1, m5
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  paddd                 m0, m1
+  paddd                 m0, m3
+  ; second half of each row
+  movu                  m1, [refq+64]
+  movu                  m2, [refq+80]
+  movu                  m3, [refq+96]
+  movu                  m4, [refq+112]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq+64]
+  psubusw               m5, m1
+  psubusw               m1, [srcq+64]
+  por                   m1, m5
+  mova                  m5, [srcq+80]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+80]
+  por                   m2, m5
+  mova                  m5, [srcq+96]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+96]
+  por                   m3, m5
+  mova                  m5, [srcq+112]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+112]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+  HIGH_SAD_FN 32, %1, 5, %2
+  mov              n_rowsd, %1
+  pxor                  m0, m0
+  pxor                  m6, m6
+
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1
+  psubusw               m1, [srcq]
+  por                   m1, m5
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+  HIGH_SAD_FN 16, %1, 5, %2
+  mov              n_rowsd, %1/2
+  pxor                  m0, m0
+  pxor                  m6, m6
+
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+ref_strideq*2]
+  movu                  m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+16]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*2+16]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1
+  psubusw               m1, [srcq]
+  por                   m1, m5
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*2]
+  por                   m3, m5
+  mova                  m5, [srcq+src_strideq*2+16]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_strideq*2+16]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+  HIGH_SAD_FN 8, %1, 7, %2
+  mov              n_rowsd, %1/4
+  pxor                  m0, m0
+  pxor                  m6, m6
+
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+ref_strideq*2]
+  movu                  m3, [refq+ref_strideq*4]
+  movu                  m4, [refq+ref_stride3q*2]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1
+  psubusw               m1, [srcq]
+  por                   m1, m5
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+src_strideq*2]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*4]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*4]
+  por                   m3, m5
+  mova                  m5, [srcq+src_stride3q*2]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_stride3q*2]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*8]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*8]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
diff --git a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000..93df92a
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1039 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 10
+                     times  8 dw  6
+                     times 16 dw  8
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  2
+                     times  8 dw 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
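+; As a usage sketch (an illustration, not part of upstream): the C wrappers
+; later in this change combine the two outputs as
+;   variance = *sse - (SE * SE) / (width * height)
+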
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  mova                 %4, %3       ; make copies to manipulate to calc sum
+  mova                 %2, %1       ; use originals for calc sse
+  pmaddwd              %3, %3
+  paddw                %4, %2
+  pmaddwd              %1, %1
+  movhlps              %2, %4
+  paddd                %6, %3
+  paddw                %4, %2
+  pxor                 %2, %2
+  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
+  punpcklwd            %4, %2       ; sign-extend word to dword
+  paddd                %6, %1
+  paddd                %5, %4
+
+%endmacro
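+
+; Scalar sketch (illustration only, assuming 8 words per xmm register) of what
+; one SUM_SSE invocation accumulates:
+;   for (i = 0; i < 8; i++) {
+;     d1 = src1[i] - dst1[i];  d2 = src2[i] - dst2[i];
+;     sum += d1 + d2;          sse += d1 * d1 + d2 * d2;
+;   }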
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+  movhlps              m3, m7
+  movhlps              m4, m6
+  paddd                m7, m3
+  paddd                m6, m4
+  pshufd               m3, m7, 0x1
+  pshufd               m4, m6, 0x1
+  paddd                m7, m3
+  paddd                m6, m4
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  movd               [r1], m7           ; store sse
+  movd                rax, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
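+
+; Net effect of STORE_AND_RET, as a hedged scalar sketch: m7 and m6 each hold
+; four dword partial totals (SSE in m7, sum in m6); the folds reduce each to a
+; single dword, the SSE total is stored through the sse pointer, and the sum
+; is returned in rax.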
+
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*2]
+%else
+  lea                srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  lea                srcq, [srcq + src_stridemp*4]
+%else
+  lea                srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define block_height heightd
+  %define bilin_filter sseq
+%else
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store bilin_filter and pw_8 location in stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                x_offset, y_offset, dst, dst_stride, height, \
+                                sse, g_bilin_filter, g_pw_8
+      %define block_height heightd
+
+      ; Store bilin_filter and pw_8 location in stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define block_height heightd
+      %define sec_str sec_strideq
+      %else
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                              x_offset, y_offset, dst, dst_stride, height, sse
+      %define block_height heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+
+%if %1 < 16
+  sar                   block_height, 1
+%endif
+%if %2 == 1 ; avg
+  shl             sec_str, 1
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + 16]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+16]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + src_strideq*2]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pavgw                m0, m1
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
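+  ; Worked example of the current form (an illustration, assuming num=16 and
+  ; rnd=8 as set up by pw_8/bilin_filter_m above): with weights (12, 4) and
+  ; inputs in1=100, in2=200, out = (12*100 + 4*200 + 8) >> 4 = 2008 >> 4 = 125.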
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m4, m1
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + 16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m1, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m2, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m1, filter_rnd
+  paddw                m1, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m1, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m4, filter_rnd
+  paddw                m4, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m4, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m4, [secq]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2+2]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m1, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m2, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading filter - this is the same as in the 8-bit depth case
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [bilin_filter+y_offsetq]
+  mova                m11, [bilin_filter+y_offsetq+16]
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register, so we reuse the src_stride
+; register. Later, src_stride has to be reloaded from the stack when needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
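+  ; Per-pixel scalar sketch of the two-pass filter below (illustration only,
+  ; weights a+b = 16 from bilin_filter_m, rnd = 8):
+  ;   h0 = (xa*s[r][c]   + xb*s[r][c+1]   + rnd) >> 4   ; horizontal pass
+  ;   h1 = (xa*s[r+1][c] + xb*s[r+1][c+1] + rnd) >> 4
+  ;   out = (ya*h0 + yb*h1 + rnd) >> 4                  ; vertical pass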
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  movu                 m1, [srcq+16]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+16]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m1, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m4, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m4, m3
+  psrlw                m0, 4
+  psrlw                m4, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m4, [secq]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  INC_SRC_BY_SRC_2STRIDE
+  lea                dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000..923418a
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,313 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vpx_highbd_calc16x16var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
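+;
+; As a hedged scalar reference (not part of upstream), for a 16x16 block of
+; 16-bit pixels this computes:
+;   *Sum = sum(src[i] - ref[i]),  *SSE = sum((src[i] - ref[i])^2)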
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+sym(vpx_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+16]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax+16]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+16]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
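+        ; The next few instructions build a per-word sign mask for xmm5
+        ; (0xffff where the word is negative, 0 otherwise) so the running
+        ; per-word sums can be sign-extended to dwords before accumulation.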
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            2
+        jnz         .var16loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vpx_highbd_calc8x8var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+sym(vpx_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        lea             rbx,    [rsi+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        lea             rbx,    [rdi+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8
+
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rbx+rax*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        lea             rbx,    [rbx+rdx*2]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm5,           xmm5
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0
+        pcmpeqw     xmm2,           xmm0
+        por         xmm1,           xmm2
+        pcmpeqw     xmm1,           xmm0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1
+        punpckhwd   xmm2,           xmm1
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax]
+        lea         rdi,            [rdi + 2*rdx]
+        sub         rcx,            4
+        jnz         .var8loop
+
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm4,           xmm0
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4
+        paddd       xmm7,           xmm5
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000..b45331c
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,581 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   int w, int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
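+
+/* Note (an assumption, not upstream documentation): the extra rounding above
+ * scales the 10- and 12-bit totals down by (bd - 8) bits for the sum and
+ * 2 * (bd - 8) bits for the SSE, presumably so the results stay on the 8-bit
+ * scale and fit the 32-bit outputs. */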
+
+
+#define HIGH_GET_VAR(S) \
+void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
+                         block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_10_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_12_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
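+
+/* Sketch of what each VAR_FN instantiation returns (illustration only):
+ *   variance = sse - sum^2 / (w * h), with shift = log2(w * h),
+ * e.g. VAR_FN(64, 64, 16, 12) uses shift 12 because 64 * 64 = 4096 = 2^12. */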
+
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+#if CONFIG_USE_X86INC
+#define DECL(w, opt) \
+  int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+                                                 ptrdiff_t src_stride, \
+                                                 int x_offset, int y_offset, \
+                                                 const uint16_t *dst, \
+                                                 ptrdiff_t dst_stride, \
+                                                 int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+
+DECLS(sse2, sse);
+// TODO(johannkoenig): enable the ssse3 or delete
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
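+/* The FN wrappers below call the 16-wide (or 8-wide) assembly helper on each
+ * 16-pixel column of wider blocks (src + 16, + 32, + 48) and add up the
+ * partial SE/SSE results before forming the variance. */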
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+                                                          int src_stride, \
+                                                          int x_offset, \
+                                                          int y_offset, \
+                                                          const uint8_t *dst8, \
+                                                          int dst_stride, \
+                                                          uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, h, \
+                                                       &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, \
+                                                       h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 48, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      }\
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
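+// FNS below instantiates FN for every supported block size. Each expansion
+// defines the 8-, 10- and 12-bit variants, all returning
+// variance = SSE - sum^2 / (w * h), with w * h == 1 << (wlog2 + hlog2).
+// The 10-bit variant rescales sum by 1/4 and SSE by 1/16; the 12-bit variant
+// rescales further and accumulates SSE in 64 bits, 16 rows at a time, to
+// avoid overflowing 32 bits.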
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) \
+int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+                                                   ptrdiff_t src_stride, \
+                                                   int x_offset, int y_offset, \
+                                                   const uint16_t *dst, \
+                                                   ptrdiff_t dst_stride, \
+                                                   const uint16_t *sec, \
+                                                   ptrdiff_t sec_stride, \
+                                                   int height, \
+                                                   unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
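+// Same structure as the plain sub-pixel variance wrappers above, except that
+// each helper also takes a second predictor (sec, stored contiguously with
+// stride w) which is averaged with the sub-pixel filtered block before the
+// sum and SSE are accumulated.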
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, x_offset, y_offset, \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, x_offset, y_offset, \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, x_offset, y_offset, \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src, src_stride, x_offset, \
+                                            y_offset, dst, dst_stride, \
+                                            sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 16, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 16, dst_stride, \
+                                            sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 32, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 32, dst_stride, \
+                                            sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 48, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 48, dst_stride, \
+                                            sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + (start_row * src_stride), src_stride, x_offset, \
+                y_offset, dst + (start_row * dst_stride), dst_stride, \
+                sec + (start_row * w), w, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 16 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 16 + (start_row * dst_stride), dst_stride, \
+                sec + 16 + (start_row * w), w, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 32 + (start_row * dst_stride), dst_stride, \
+                sec + 32 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 48 + (start_row * dst_stride), dst_stride, \
+                sec + 48 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libvpx/vpx_dsp/x86/intrapred_sse2.asm
new file mode 100644
index 0000000..22b5731
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -0,0 +1,667 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
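+; Rounding constants for the DC predictors: pw_N (= N) rounds a sum of 2N
+; boundary pixels (N above + N left) before the arithmetic shift, while
+; pw2_N (= N/2) rounds the N-pixel sums used by the top-only and left-only
+; variants. dc_128 is the constant fill value for the dc_128 predictors.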
+pw_4:  times 8 dw 4
+pw_8:  times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4:  times 8 dw 2
+pw2_8:  times 8 dw 4
+pw2_16:  times 8 dw 8
+pw2_32:  times 8 dw 16
+
+SECTION .text
+
+INIT_MMX sse
+cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [aboveq]
+  punpckldq             m0, [leftq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [leftq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_4)]
+  psraw                 m0, 2
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movd                  m0, [aboveq]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_4)]
+  psraw                 m0, 2
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_8)]
+  psraw                 m0, 4
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_8)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  movq                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1
+  paddw                 m0, [GLOBAL(pw2_8)]
+  psraw                 m0, 3
+  pshufw                m0, m0, 0x0
+  packuswb              m0, m0
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movd     m0,        [GLOBAL(dc_128)]
+  movd    [dstq          ], m0
+  movd    [dstq+strideq  ], m0
+  movd    [dstq+strideq*2], m0
+  movd    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movq    m0,        [GLOBAL(dc_128)]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
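+; For the 16- and 32-pixel-wide predictors, psadbw leaves two partial sums
+; (one per 64-bit lane), so the high lane is folded into the low one with
+; movhlps + paddw before the rounded average is broadcast.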
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_16)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  pxor                  m2, m2
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_16)]
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  pxor                  m2, m2
+  mova                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_16)]
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  psadbw                m3, m1
+  psadbw                m4, m1
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_32)]
+  psraw                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_32)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  pxor                  m1, m1
+  mova                  m0, [leftq]
+  mova                  m2, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  psadbw                m0, m1
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw2_32)]
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0
+  packuswb              m0, m0
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  RET
+
+INIT_MMX sse
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movd                  m0, [aboveq]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  RET
+
+INIT_MMX sse
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]
+  mova                  m1, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m1
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m1
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m1
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m1
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
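+; TM (true-motion) prediction: each pixel is clamp(left[row] + above[col] -
+; above[-1]). m0 holds (above - top_left); each loop iteration adds the
+; broadcast left pixel for two rows and packuswb clamps to [0, 255].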
+INIT_MMX sse
+cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  movd                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpcklbw             m0, m1
+  pshufw                m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -2
+  add                leftq, 4
+  psubw                 m0, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshufw                m2, m2, 0x0
+  pshufw                m3, m3, 0x0
+  paddw                 m2, m0
+  paddw                 m3, m0
+  packuswb              m2, m2
+  packuswb              m3, m3
+  movd      [dstq        ], m2
+  movd      [dstq+strideq], m3
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  movq                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpcklbw             m0, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -4
+  punpcklqdq            m2, m2
+  add                leftq, 8
+  psubw                 m0, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m2, m0
+  paddw                 m3, m0
+  packuswb              m2, m3
+  movq      [dstq        ], m2
+  movhps    [dstq+strideq], m2
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  mova                  m0, [aboveq]
+  punpcklbw             m2, m1
+  punpckhbw             m4, m0, m1
+  punpcklbw             m0, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -8
+  punpcklqdq            m2, m2
+  add                leftq, 16
+  psubw                 m0, m2
+  psubw                 m4, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m3, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m3, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m3, m3, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m3, m3
+  paddw                 m5, m2, m0
+  paddw                 m6, m3, m0
+  paddw                 m2, m4
+  paddw                 m3, m4
+  packuswb              m5, m2
+  packuswb              m6, m3
+  mova      [dstq        ], m5
+  mova      [dstq+strideq], m6
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+  pxor                  m1, m1
+  movd                  m2, [aboveq-1]
+  mova                  m0, [aboveq]
+  mova                  m4, [aboveq+16]
+  punpcklbw             m2, m1
+  punpckhbw             m3, m0, m1
+  punpckhbw             m5, m4, m1
+  punpcklbw             m0, m1
+  punpcklbw             m4, m1
+  pshuflw               m2, m2, 0x0
+  DEFINE_ARGS dst, stride, line, left
+  mov                lineq, -16
+  punpcklqdq            m2, m2
+  add                leftq, 32
+  psubw                 m0, m2
+  psubw                 m3, m2
+  psubw                 m4, m2
+  psubw                 m5, m2
+.loop:
+  movd                  m2, [leftq+lineq*2]
+  movd                  m6, [leftq+lineq*2+1]
+  punpcklbw             m2, m1
+  punpcklbw             m6, m1
+  pshuflw               m2, m2, 0x0
+  pshuflw               m6, m6, 0x0
+  punpcklqdq            m2, m2
+  punpcklqdq            m6, m6
+  paddw                 m7, m2, m0
+  paddw                 m8, m2, m3
+  paddw                 m9, m2, m4
+  paddw                 m2, m5
+  packuswb              m7, m8
+  packuswb              m9, m2
+  paddw                 m2, m6, m0
+  paddw                 m8, m6, m3
+  mova   [dstq           ], m7
+  paddw                 m7, m6, m4
+  paddw                 m6, m5
+  mova   [dstq        +16], m9
+  packuswb              m2, m8
+  packuswb              m7, m6
+  mova   [dstq+strideq   ], m2
+  mova   [dstq+strideq+16], m7
+  lea                 dstq, [dstq+strideq*2]
+  inc                lineq
+  jnz .loop
+  REP_RET
+%endif
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
similarity index 100%
rename from libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
rename to libvpx/vpx_dsp/x86/intrapred_ssse3.asm
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 0000000..f3af68f
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -0,0 +1,4054 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
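+// RECON_AND_STORE4X4 adds one row of four 16-bit residuals (in_x) to the four
+// destination pixels, saturates back to 8 bits with packus and stores the
+// result; it expects an all-zero __m128i named `zero` in the enclosing scope.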
+#define RECON_AND_STORE4X4(dest, in_x) \
+{                                                     \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+  d0 = _mm_unpacklo_epi8(d0, zero); \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); \
+  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16(
+      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_load_si128((const __m128i *)input);
+  input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+  input1 = _mm_unpackhi_epi32(input0, input0);
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input3 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input1);
+  input1 = _mm_packs_epi32(input2, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Swap columns 2 and 3; after the add/sub below:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Swap columns 2 and 3; after the add/sub below:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store input0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store input1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store input2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store input3
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+  }
+}
+
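+// DC-only case: with a single nonzero coefficient the 2-D transform collapses
+// to two rounded multiplies of input[0] by cospi_16_64 and a rounded shift by
+// 4, so reconstruction is just adding one constant to every pixel.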
+void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 4);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void idct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+
+  transpose_4x4(in);
+  // stage 1
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[3], v[2]);
+
+  // stage 2
+  in[0] = _mm_add_epi16(u[0], u[1]);
+  in[1] = _mm_sub_epi16(u[0], u[1]);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void iadst4_sse2(__m128i *in) {
+  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8], in7;
+
+  transpose_4x4(in);
+  in7 = _mm_srli_si128(in[1], 8);
+  in7 = _mm_add_epi16(in7, in[0]);
+  in7 = _mm_sub_epi16(in7, in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(v[3], v[4]);
+  u[2] = v[2];
+  u[3] = _mm_add_epi32(u[0], u[1]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_add_epi32(u[3], v[5]);
+  u[6] = _mm_sub_epi32(u[5], u[4]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
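+// TRANSPOSE_8X8 transposes an 8x8 block of 16-bit values held in eight
+// __m128i registers using the usual unpack ladder: 16-bit, then 32-bit, then
+// 64-bit interleaves.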
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+                                                        \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+                                                            \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+                         out0, out1, out2, out3) \
+  {                                              \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  {                                            \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+  }
+
+// Macro for multiplying elements by constants and adding the products together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
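+// IDCT8 is the 4-stage butterfly for the 8-point inverse DCT. Stages 1 and 2
+// perform the constant multiplies via MULTIPLICATION_AND_ADD, stage 3 forms
+// the even-half sums/differences and the stp1_5/stp1_6 rotation, and stage 4
+// produces the outputs. The stg*, stp1_*, stp2_*, tmp0-tmp7 and rounding
+// temporaries must be declared by the caller.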
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+              out0, out1, out2, out3, out4, out5, out6, out7)  \
+  { \
+  /* Stage1 */      \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4  */ \
+  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+  }
+
+void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+  // 2-D: two passes of transpose + 1-D IDCT8 (row pass, then column pass)
+  for (i = 0; i < 2; i++) {
+    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+                  in0, in1, in2, in3, in4, in5, in6, in7);
+
+    // 4-stage 1D idct8x8
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+          in0, in1, in2, in3, in4, in5, in6, in7);
+  }
+
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest + 0 * stride, dc_value);
+  RECON_AND_STORE(dest + 1 * stride, dc_value);
+  RECON_AND_STORE(dest + 2 * stride, dc_value);
+  RECON_AND_STORE(dest + 3 * stride, dc_value);
+  RECON_AND_STORE(dest + 4 * stride, dc_value);
+  RECON_AND_STORE(dest + 5 * stride, dc_value);
+  RECON_AND_STORE(dest + 6 * stride, dc_value);
+  RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+void idct8_sse2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                in0, in1, in2, in3, in4, in5, in6, in7);
+
+  // 4-stage 1D idct8x8
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+void iadst8_sse2(__m128i *in) {
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose
+  array_transpose_8x8(in, in);
+
+  // properly aligned for butterfly input
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // butterfly addition/subtraction
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // rounding and shift
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
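+// Reduced 8x8 IDCT: only the first four coefficient rows are loaded, so this
+// path assumes the non-zero coefficients are confined to the top-left 4x4 of
+// the block (the _12 suffix refers to the at-most-12 non-zero coefficients it
+// is expected to handle).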
+void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // Rows. Load 4-row input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+  // 8x4 Transpose
+  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+  // Stage1
+  {
+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+  }
+
+  // Stage2
+  {
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+    stp2_4 = tmp0;
+    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+        in0, in1, in2, in3, in4, in5, in6, in7);
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
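+// Stages 2-6 of the 16-point IDCT butterfly over the sixteen input vectors
+// in[0..15] (eight transforms in parallel, one per 16-bit lane); the final
+// stage-7 add/subtract is done by the caller.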
+#define IDCT16 \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+    \
+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  } \
+  \
+  /* Stage4 */ \
+  { \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+    \
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+    \
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+    \
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+  } \
+    \
+  /* Stage6 */ \
+  { \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+    \
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  }
+
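+// Reduced version of the IDCT16 butterfly for the case where only in[0..3]
+// carry non-zero data; all other inputs are treated as zero (the path the
+// *_10_add variant is expected to use).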
+#define IDCT16_10 \
+    /* Stage2 */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3 */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
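+// Full 16x16 inverse DCT and reconstruction: two passes of the IDCT16
+// butterfly (rows first, then columns), followed by rounding, a shift by 6
+// and addition into the destination.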
+void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[16], l[16], r[16], *curr1;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
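+  // First pass (rows): each iteration transforms 8 of the 16 input rows and
+  // stores the intermediate result in l[] (first half) or r[] (second half).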
+  curr1 = l;
+  for (i = 0; i < 2; i++) {
+    // 1-D idct
+
+    // Load input data.
+    in[0] = _mm_load_si128((const __m128i *)input);
+    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+
+    IDCT16
+
+    // Stage7
+    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    curr1 = r;
+    input += 128;
+  }
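+  // Second pass (columns): transpose the intermediate halves back, run the
+  // butterfly again and write out 8 reconstructed columns per iteration.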
+  for (i = 0; i < 2; i++) {
+    int j;
+    // 1-D idct
+    array_transpose_8x8(l + i * 8, in);
+    array_transpose_8x8(r + i * 8, in + 8);
+
+    IDCT16
+
+    // Stage7: final 2-D output
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
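+// DC-only 16x16 path: a single DC value is derived from input[0] and added
+// to every pixel of the 16x16 block, eight columns at a time.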
+void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 2; ++i) {
+    RECON_AND_STORE(dest +  0 * stride, dc_value);
+    RECON_AND_STORE(dest +  1 * stride, dc_value);
+    RECON_AND_STORE(dest +  2 * stride, dc_value);
+    RECON_AND_STORE(dest +  3 * stride, dc_value);
+    RECON_AND_STORE(dest +  4 * stride, dc_value);
+    RECON_AND_STORE(dest +  5 * stride, dc_value);
+    RECON_AND_STORE(dest +  6 * stride, dc_value);
+    RECON_AND_STORE(dest +  7 * stride, dc_value);
+    RECON_AND_STORE(dest +  8 * stride, dc_value);
+    RECON_AND_STORE(dest +  9 * stride, dc_value);
+    RECON_AND_STORE(dest + 10 * stride, dc_value);
+    RECON_AND_STORE(dest + 11 * stride, dc_value);
+    RECON_AND_STORE(dest + 12 * stride, dc_value);
+    RECON_AND_STORE(dest + 13 * stride, dc_value);
+    RECON_AND_STORE(dest + 14 * stride, dc_value);
+    RECON_AND_STORE(dest + 15 * stride, dc_value);
+    dest += 8;
+  }
+}
+
+static void iadst16_8col(__m128i *in) {
+  // perform the 16-point 1-D ADST on 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
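+// In-place 16-point IDCT over the 8 columns held in in[0..15], the DCT
+// counterpart to iadst16_8col above.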
+static void idct16_8col(__m128i *in) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[8]  = _mm_packs_epi32(u[0], u[1]);
+  s[15] = _mm_packs_epi32(u[2], u[3]);
+  s[9]  = _mm_packs_epi32(u[4], u[5]);
+  s[14] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = _mm_packs_epi32(u[8], u[9]);
+  s[13] = _mm_packs_epi32(u[10], u[11]);
+  s[11] = _mm_packs_epi32(u[12], u[13]);
+  s[12] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  t[0] = s[0];
+  t[1] = s[1];
+  t[2] = s[2];
+  t[3] = s[3];
+  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[4] = _mm_packs_epi32(u[0], u[1]);
+  t[7] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = _mm_packs_epi32(u[4], u[5]);
+  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_add_epi16(t[4], t[5]);
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  s[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  s[15] = t[15];
+  s[9]  = _mm_packs_epi32(u[8], u[9]);
+  s[14] = _mm_packs_epi32(u[10], u[11]);
+  s[10] = _mm_packs_epi32(u[12], u[13]);
+  s[13] = _mm_packs_epi32(u[14], u[15]);
+  s[11] = t[11];
+  s[12] = t[12];
+
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  t[4] = s[4];
+  t[7] = s[7];
+
+  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  t[5] = _mm_packs_epi32(u[0], u[1]);
+  t[6] = _mm_packs_epi32(u[2], u[3]);
+
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
+
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  s[8] = t[8];
+  s[9] = t[9];
+
+  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  s[10] = _mm_packs_epi32(u[0], u[1]);
+  s[13] = _mm_packs_epi32(u[2], u[3]);
+  s[11] = _mm_packs_epi32(u[4], u[5]);
+  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[14] = t[14];
+  s[15] = t[15];
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], s[15]);
+  in[1] = _mm_add_epi16(s[1], s[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], s[9]);
+  in[7] = _mm_add_epi16(s[7], s[8]);
+  in[8] = _mm_sub_epi16(s[7], s[8]);
+  in[9] = _mm_sub_epi16(s[6], s[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], s[14]);
+  in[15] = _mm_sub_epi16(s[0], s[15]);
+}
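+
+// The rotations in idct16_8col() above, and throughout this file, follow one
+// pattern: interleave two 16-bit vectors with unpacklo/unpackhi, multiply the
+// pairs against pair_set_epi16() cosine constants with _mm_madd_epi16(), then
+// round with DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS and pack back
+// to 16 bits with saturation (_mm_packs_epi32). One output lane, ignoring the
+// saturation (scalar sketch, for illustration only):
+//
+//   int32_t t = a * c0 + b * c1;                       /* _mm_madd_epi16   */
+//   int16_t r = (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+//                                                      /* add, srai, packs */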
+
+void idct16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
+}
+
+void iadst16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
+}
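+
+// idct16_sse2() and iadst16_sse2() implement one 1-D pass of the separable
+// 16x16 inverse transform: the block, kept as two arrays of sixteen 8-lane
+// registers, is transposed as a whole and the 1-D 16-point transform is run
+// on each 8-wide half. A full 2-D transform applies one such pass per
+// dimension.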
+
+void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i in[16], l[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+  // First 1-D inverse DCT
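+  // The _10 suffix means at most the first 10 coefficients (in scan order)
+  // are non-zero, so only the top-left 4x4 block of the 16x16 coefficient
+  // array matters: read the first eight coefficients of rows 0-3 and run the
+  // first pass on half-width data.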
+  // Load input data.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  // Stage2
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+    stp2_10 = _mm_packs_epi32(tmp0, zero);
+    stp2_13 = _mm_packs_epi32(tmp2, zero);
+    stp2_11 = _mm_packs_epi32(tmp4, zero);
+    stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Only the left 8x16 half is computed here.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    int j;
+    array_transpose_4X8(l + 8 * i, in);
+
+    IDCT16_10
+
+    // Stage7
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
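+
+// After both 1-D passes the residual is still scaled by 2^6, so the final
+// step computes ROUND_POWER_OF_TWO(x, 6) = (x + 32) >> 6 before
+// reconstruction; RECON_AND_STORE (defined earlier in this file) then adds
+// that offset to the predicted pixels in dest, saturating to 8 bits.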
+
+#define LOAD_DQCOEFF(reg, input) \
+  {  \
+    reg = _mm_load_si128((const __m128i *) input); \
+    input += 8; \
+  }
+
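+// LOAD_DQCOEFF (above) reads one row of eight 16-bit dequantized coefficients
+// into `reg` and advances `input` by eight, so the caller can walk the
+// coefficient buffer sequentially while choosing which register each row
+// lands in.
+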
+#define IDCT32_34 \
+/* Stage1 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
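+
+// IDCT32_34 is the reduced 32-point transform used when at most 34
+// coefficients are non-zero, so only in[0..7] carry data and in[8..31] are
+// implicitly zero. Butterflies that would pair a loaded row with a zero row
+// unpack against `zero` and use MULTIPLICATION_AND_ADD_2 (defined earlier in
+// this file); _mm_madd_epi16 on such a pair (a, 0) with constants (c0, c1)
+// reduces to the single product a * c0, and rotations whose inputs would both
+// be zero are omitted.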
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+                         stp1_17, stp1_30) \
+  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+  \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+                         stp2_14) \
+  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+                         stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+  \
+  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+  \
+  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  \
+  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+                         stp1_6) \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  \
+  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+                         stp2_2, stp2_3) \
+  \
+  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
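+
+// IDCT32 is the full 32-point transform, with the same stage structure as
+// IDCT32_34 above but with all 32 input rows participating.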
+
+// Only the upper-left 8x8 block has non-zero coefficients.
+void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[32];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data. Only the top-left 8x8 block needs to be loaded.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 32));
+  in[2] = _mm_load_si128((const __m128i *)(input + 64));
+  in[3] = _mm_load_si128((const __m128i *)(input + 96));
+  in[4] = _mm_load_si128((const __m128i *)(input + 128));
+  in[5] = _mm_load_si128((const __m128i *)(input + 160));
+  in[6] = _mm_load_si128((const __m128i *)(input + 192));
+  in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+  for (i = 8; i < 32; ++i) {
+    in[i] = _mm_setzero_si128();
+  }
+
+  array_transpose_8x8(in, in);
+  // TODO(hkuang): The following transposes are unnecessary, but removing them
+  // leads to a performance drop on some devices.
+  array_transpose_8x8(in + 8, in + 8);
+  array_transpose_8x8(in + 16, in + 16);
+  array_transpose_8x8(in + 24, in + 24);
+
+  IDCT32_34
+
+  // 1-D: Store 32 intermediate results for each 8x32 block.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
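+
+  // The first pass above produced one 8x32 block of intermediate results in
+  // col[]. The second pass below transposes it back 8x8 at a time and runs
+  // the same stages once per group of eight output columns.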
+  for (i = 0; i < 4; i++) {
+    int j;
+    const __m128i zero = _mm_setzero_si128();
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + i * 8, in);
+    IDCT32_34
+
+    // 2-D: Calculate the results and store them to the destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[128], zero_idx[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  for (i = 0; i < 4; i++) {
+    i32 = (i << 5);
+    // First 1-D idct
+    // Load input data.
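+    // Each iteration reads eight full 32-wide rows of coefficients; the four
+    // 8-wide chunks of every row are scattered into in[0..7], in[8..15],
+    // in[16..23] and in[24..31], so each group holds one 8x8 tile ready for
+    // the transposes below.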
+    LOAD_DQCOEFF(in[0], input);
+    LOAD_DQCOEFF(in[8], input);
+    LOAD_DQCOEFF(in[16], input);
+    LOAD_DQCOEFF(in[24], input);
+    LOAD_DQCOEFF(in[1], input);
+    LOAD_DQCOEFF(in[9], input);
+    LOAD_DQCOEFF(in[17], input);
+    LOAD_DQCOEFF(in[25], input);
+    LOAD_DQCOEFF(in[2], input);
+    LOAD_DQCOEFF(in[10], input);
+    LOAD_DQCOEFF(in[18], input);
+    LOAD_DQCOEFF(in[26], input);
+    LOAD_DQCOEFF(in[3], input);
+    LOAD_DQCOEFF(in[11], input);
+    LOAD_DQCOEFF(in[19], input);
+    LOAD_DQCOEFF(in[27], input);
+
+    LOAD_DQCOEFF(in[4], input);
+    LOAD_DQCOEFF(in[12], input);
+    LOAD_DQCOEFF(in[20], input);
+    LOAD_DQCOEFF(in[28], input);
+    LOAD_DQCOEFF(in[5], input);
+    LOAD_DQCOEFF(in[13], input);
+    LOAD_DQCOEFF(in[21], input);
+    LOAD_DQCOEFF(in[29], input);
+    LOAD_DQCOEFF(in[6], input);
+    LOAD_DQCOEFF(in[14], input);
+    LOAD_DQCOEFF(in[22], input);
+    LOAD_DQCOEFF(in[30], input);
+    LOAD_DQCOEFF(in[7], input);
+    LOAD_DQCOEFF(in[15], input);
+    LOAD_DQCOEFF(in[23], input);
+    LOAD_DQCOEFF(in[31], input);
+
+    // Check whether all 32 loaded vectors are zero by OR-ing them together;
+    // if so, the first-pass output for this 8x32 block is all zero and the
+    // transform can be skipped.
+    zero_idx[0] = _mm_or_si128(in[0], in[1]);
+    zero_idx[1] = _mm_or_si128(in[2], in[3]);
+    zero_idx[2] = _mm_or_si128(in[4], in[5]);
+    zero_idx[3] = _mm_or_si128(in[6], in[7]);
+    zero_idx[4] = _mm_or_si128(in[8], in[9]);
+    zero_idx[5] = _mm_or_si128(in[10], in[11]);
+    zero_idx[6] = _mm_or_si128(in[12], in[13]);
+    zero_idx[7] = _mm_or_si128(in[14], in[15]);
+    zero_idx[8] = _mm_or_si128(in[16], in[17]);
+    zero_idx[9] = _mm_or_si128(in[18], in[19]);
+    zero_idx[10] = _mm_or_si128(in[20], in[21]);
+    zero_idx[11] = _mm_or_si128(in[22], in[23]);
+    zero_idx[12] = _mm_or_si128(in[24], in[25]);
+    zero_idx[13] = _mm_or_si128(in[26], in[27]);
+    zero_idx[14] = _mm_or_si128(in[28], in[29]);
+    zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;
+    }
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    IDCT32
+
+    // 1_D: Store 32 intermediate results for each 8x32 block.
+    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+  }
+  for (i = 0; i < 4; i++) {
+    // Second 1-D idct
+    j = i << 3;
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    IDCT32
+
+    // 2_D: Calculate the results and store them to the destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
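+  // DC-only case: scale the single nonzero coefficient by cospi_16_64 once per
+  // 1-D pass, apply the final rounding shift of 6, then add it to every pixel.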
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 4; ++i) {
+    int j;
+    for (j = 0; j < 32; ++j) {
+      RECON_AND_STORE(dest + j * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
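+// Clamp 16-bit pixel values to the valid range [0, (1 << bd) - 1].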
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+  __m128i ubounded, retval;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+  ubounded = _mm_cmpgt_epi16(value, max);
+  retval = _mm_andnot_si128(ubounded, value);
+  ubounded = _mm_and_si128(ubounded, max);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+  return retval;
+}
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i temp_mm, min_input, max_input;
+  int test;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int optimised_cols = 0;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);
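+  // If any coefficient lies outside [-12043, 12043], the 16-bit SSE2 path
+  // could overflow; such blocks fall back to the C transforms below.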
+  const __m128i max = _mm_set1_epi16(12043);
+  const __m128i min = _mm_set1_epi16(-12043);
+  // Load input into __m128i
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Pack to 16 bits
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp_mm = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp_mm);
+
+  if (!test) {
+    // Do the row transform
+    idct4_sse2(inptr);
+
+    // Check the min & max values
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp_mm = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp_mm);
+
+    if (test) {
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct4_c(input, outptr, bd);
+      input += 4;
+      outptr += 4;
+    }
+  }
+
+  if (optimised_cols) {
+    idct4_sse2(inptr);
+
+    // Final round and shift
+    inptr[0] = _mm_add_epi16(inptr[0], eight);
+    inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+    inptr[0] = _mm_srai_epi16(inptr[0], 4);
+    inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+    // Reconstruction and Store
+    {
+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+      d0 = _mm_unpacklo_epi64(
+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+      d2 = _mm_unpacklo_epi64(
+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // store row 0
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // store row 1
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // store row 2
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // store row 3
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[4], temp_out[4];
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j * 4 + i];
+      vpx_highbd_idct4_c(temp_in, temp_out, bd);
+      for (j = 0; j < 4; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 8; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_8x8(inptr, inptr);
+      for (i = 0; i < 8; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 8; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final rounding and shift, then reconstruct and store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coeffs.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+      array_transpose_4X8(inptr, inptr);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final rounding and shift, then reconstruct and store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_16x16(inptr, inptr + 16);
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 16; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final rounding and shift, then reconstruct and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
+  // we only need to consider the first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+      array_transpose_8x8(inptr, inptr);
+      array_transpose_8x8(inptr + 8, inptr + 16);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final rounding and shift, then reconstruct and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
similarity index 83%
rename from libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
rename to libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index 0f179b4..658a914 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,12 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
 #include <emmintrin.h>  // SSE2
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/inv_txfm.h"
 
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
@@ -115,7 +116,6 @@
       d0 = _mm_add_epi16(in_x, d0); \
       d0 = _mm_packus_epi16(d0, d0); \
       _mm_storel_epi64((__m128i *)(dest), d0); \
-      dest += stride; \
   }
 
 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -156,20 +156,29 @@
   in[14] = _mm_srai_epi16(in[14], 6);
   in[15] = _mm_srai_epi16(in[15], 6);
 
-  RECON_AND_STORE(dest, in[0]);
-  RECON_AND_STORE(dest, in[1]);
-  RECON_AND_STORE(dest, in[2]);
-  RECON_AND_STORE(dest, in[3]);
-  RECON_AND_STORE(dest, in[4]);
-  RECON_AND_STORE(dest, in[5]);
-  RECON_AND_STORE(dest, in[6]);
-  RECON_AND_STORE(dest, in[7]);
-  RECON_AND_STORE(dest, in[8]);
-  RECON_AND_STORE(dest, in[9]);
-  RECON_AND_STORE(dest, in[10]);
-  RECON_AND_STORE(dest, in[11]);
-  RECON_AND_STORE(dest, in[12]);
-  RECON_AND_STORE(dest, in[13]);
-  RECON_AND_STORE(dest, in[14]);
-  RECON_AND_STORE(dest, in[15]);
+  RECON_AND_STORE(dest +  0 * stride, in[0]);
+  RECON_AND_STORE(dest +  1 * stride, in[1]);
+  RECON_AND_STORE(dest +  2 * stride, in[2]);
+  RECON_AND_STORE(dest +  3 * stride, in[3]);
+  RECON_AND_STORE(dest +  4 * stride, in[4]);
+  RECON_AND_STORE(dest +  5 * stride, in[5]);
+  RECON_AND_STORE(dest +  6 * stride, in[6]);
+  RECON_AND_STORE(dest +  7 * stride, in[7]);
+  RECON_AND_STORE(dest +  8 * stride, in[8]);
+  RECON_AND_STORE(dest +  9 * stride, in[9]);
+  RECON_AND_STORE(dest + 10 * stride, in[10]);
+  RECON_AND_STORE(dest + 11 * stride, in[11]);
+  RECON_AND_STORE(dest + 12 * stride, in[12]);
+  RECON_AND_STORE(dest + 13 * stride, in[13]);
+  RECON_AND_STORE(dest + 14 * stride, in[14]);
+  RECON_AND_STORE(dest + 15 * stride, in[15]);
 }
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
similarity index 99%
rename from libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
rename to libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 2c10607..68e7fa4 100644
--- a/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
+++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -7,6 +7,7 @@
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
+
 %include "third_party/x86inc/x86inc.asm"
 
 ; This file provides SSSE3 version of the inverse transformation. Part
diff --git a/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000..df6f469
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,103 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+  ; a c d b  to  a b c d
+  SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+  ; input:
+  ; m0 a
+  ; m1 b
+  ; m2 c
+  ; m3 d
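+  ;
+  ; Inverse WHT butterfly:
+  ;   a += c; d -= b; e = (a - d) >> 1; b = e - b; c = e - c; a -= b; d += c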
+  paddw           m0,        m2
+  psubw           m3,        m1
+
+  ; wide subtract
+  punpcklwd       m4,        m0
+  punpcklwd       m5,        m3
+  psrad           m4,        16
+  psrad           m5,        16
+  psubd           m4,        m5
+  psrad           m4,        1
+  packssdw        m4,        m4             ; e
+
+  psubw           m5,        m4,        m1  ; b
+  psubw           m4,        m2             ; c
+  psubw           m0,        m5
+  paddw           m3,        m4
+                                ; m0 a
+  SWAP            1,         5  ; m1 b
+  SWAP            2,         4  ; m2 c
+                                ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  punpcklwd       m0,        m2
+  punpcklwd       m1,        m3
+  mova            m2,        m0
+  punpcklwd       m0,        m1
+  punpckhwd       m2,        m1
+  pshufd          m1,        m0, 0x0e
+  pshufd          m3,        m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+  mova            m3, m0
+  punpcklwd       m0, m1
+  punpckhwd       m3, m1
+  mova            m2, m0
+  punpcklwd       m0, m3
+  punpckhwd       m2, m3
+  pshufd          m1, m0, 0x0e
+  pshufd          m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movd            m%3,       [outputq]
+  movd            m%4,       [outputq + strideq]
+  punpcklbw       m%3,       m%5
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1
+  movd            [outputq + strideq], m%2
+%endmacro
+
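+; 4x4 inverse Walsh-Hadamard transform, with the result added to the
+; destination block: the input is pre-scaled by >> 2 and two column-transform
+; passes are applied around transposes.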
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+  mova            m0,        [inputq +  0]
+  mova            m1,        [inputq + 16]
+
+  psraw           m0,        2
+  psraw           m1,        2
+
+  TRANSPOSE_4X4_WIDE
+  REORDER_INPUTS
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  REORDER_INPUTS
+  TRANSFORM_COLS
+
+  pxor            m4, m4
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4
+  lea             outputq, [outputq + 2 * strideq]
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4
+
+  RET
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
similarity index 89%
rename from libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
rename to libvpx/vpx_dsp/x86/loopfilter_avx2.c
index 439c028..23a97dd 100644
--- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -10,6 +10,9 @@
 
 #include <immintrin.h>  /* AVX2 */
 
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh) {
@@ -100,7 +103,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -392,6 +395,11 @@
     }
 }
 
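+// Shuffle mask for _mm256_shuffle_epi8 that zero-extends packed bytes to
+// 16-bit words within each 128-bit lane; an index with the high bit set (128)
+// produces a zero byte.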
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
 static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh) {
@@ -401,6 +409,9 @@
     __m128i p7, p6, p5;
     __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
     __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
 
     const __m128i thresh = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +420,37 @@
     const __m128i blimit = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _blimit[0]));
 
-    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
-    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
-    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
-    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
-    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
-    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
-    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
-    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
-    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
-    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
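+    // Each row is loaded once and broadcast into both 128-bit lanes so the
+    // in-lane pshufb (filt_loopfilter_avx2) below can zero-extend it to
+    // sixteen 16-bit words; the low lane still feeds the 8-bit filter code.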
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
 
     {
         const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -483,7 +515,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -534,23 +566,35 @@
             flat = _mm_cmpeq_epi8(flat, zero);
             flat = _mm_and_si128(flat, mask);
 
-            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
-            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 6 * p)));
+            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 5 * p)));
+            p5 = _mm256_castsi256_si128(p256_5);
+            q5 = _mm256_castsi256_si128(q256_5);
             flat2 = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                     _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
 
             flat2 = _mm_max_epu8(work, flat2);
-            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
-            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 7 * p)));
+            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 6 * p)));
+            p6 = _mm256_castsi256_si128(p256_6);
+            q6 = _mm256_castsi256_si128(q256_6);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                     _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
 
             flat2 = _mm_max_epu8(work, flat2);
 
-            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
-            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 8 * p)));
+            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 7 * p)));
+            p7 = _mm256_castsi256_si128(p256_7);
+            q7 = _mm256_castsi256_si128(q256_7);
             work = _mm_max_epu8(
                     _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                     _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +610,28 @@
         {
             const __m256i eight = _mm256_set1_epi16(8);
             const __m256i four = _mm256_set1_epi16(4);
-            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
-                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
-                    p256_0, q256_0;
             __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                     pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                     res_q;
 
-            p256_7 = _mm256_cvtepu8_epi16(p7);
-            p256_6 = _mm256_cvtepu8_epi16(p6);
-            p256_5 = _mm256_cvtepu8_epi16(p5);
-            p256_4 = _mm256_cvtepu8_epi16(p4);
-            p256_3 = _mm256_cvtepu8_epi16(p3);
-            p256_2 = _mm256_cvtepu8_epi16(p2);
-            p256_1 = _mm256_cvtepu8_epi16(p1);
-            p256_0 = _mm256_cvtepu8_epi16(p0);
-            q256_0 = _mm256_cvtepu8_epi16(q0);
-            q256_1 = _mm256_cvtepu8_epi16(q1);
-            q256_2 = _mm256_cvtepu8_epi16(q2);
-            q256_3 = _mm256_cvtepu8_epi16(q3);
-            q256_4 = _mm256_cvtepu8_epi16(q4);
-            q256_5 = _mm256_cvtepu8_epi16(q5);
-            q256_6 = _mm256_cvtepu8_epi16(q6);
-            q256_7 = _mm256_cvtepu8_epi16(q7);
+            const __m256i filter = _mm256_load_si256(
+                                  (__m256i const *)filt_loopfilter_avx2);
+            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
 
             pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                     _mm256_add_epi16(p256_4, p256_3));
@@ -933,7 +976,7 @@
     }
 }
 
-void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh, int count) {
     if (count == 1)
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/libvpx/vpx_dsp/x86/loopfilter_mmx.asm
similarity index 97%
rename from libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
rename to libvpx/vpx_dsp/x86/loopfilter_mmx.asm
index 91055b9..b9c18b6 100644
--- a/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/libvpx/vpx_dsp/x86/loopfilter_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 
-;void vp9_lpf_horizontal_4_mmx
+;void vpx_lpf_horizontal_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
@@ -21,8 +21,8 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
-sym(vp9_lpf_horizontal_4_mmx):
+global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
+sym(vpx_lpf_horizontal_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -224,7 +224,7 @@
     ret
 
 
-;void vp9_lpf_vertical_4_mmx
+;void vpx_lpf_vertical_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
@@ -233,8 +233,8 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_lpf_vertical_4_mmx) PRIVATE
-sym(vp9_lpf_vertical_4_mmx):
+global sym(vpx_lpf_vertical_4_mmx) PRIVATE
+sym(vpx_lpf_vertical_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -601,9 +601,6 @@
 t80:
     times 8 db 0x80
 align 16
-t1s:
-    times 8 db 0x01
-align 16
 t3:
     times 8 db 0x03
 align 16
@@ -612,15 +609,3 @@
 align 16
 ones:
     times 4 dw 0x0001
-align 16
-s27:
-    times 4 dw 0x1b00
-align 16
-s18:
-    times 4 dw 0x1200
-align 16
-s9:
-    times 4 dw 0x0900
-align 16
-s63:
-    times 4 dw 0x003f
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
similarity index 68%
rename from libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
rename to libvpx/vpx_dsp/x86/loopfilter_sse2.c
index 448ad5a..ed10127 100644
--- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -9,9 +9,15 @@
  */
 
 #include <emmintrin.h>  // SSE2
-#include "vp9/common/vp9_loopfilter.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
 
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
@@ -46,15 +52,12 @@
 
   {
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
-                            _mm_subs_epu8(q0p0, q1p1));
+    abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
     fe = _mm_set1_epi8(0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
-                            _mm_subs_epu8(p0q0, q0p0));
-    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
-                            _mm_subs_epu8(p1q1, q1p1));
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -68,10 +71,8 @@
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
 
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
-                                     _mm_subs_epu8(q1p1, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
-                                     _mm_subs_epu8(q2p2, q3p3)));
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
@@ -99,7 +100,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -125,10 +126,7 @@
 
     {
       __m128i work;
-      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
-                                       _mm_subs_epu8(q0p0, q2p2)),
-                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
-                                       _mm_subs_epu8(q0p0, q3p3)));
+      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
       flat = _mm_max_epu8(abs_p1p0, flat);
       flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
       flat = _mm_subs_epu8(flat, one);
@@ -142,21 +140,12 @@
       q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
       q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                            (__m64 *)(s + 6 * p)));
-
-      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
-                                        _mm_subs_epu8(q0p0, q4p4)),
-                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
-                                        _mm_subs_epu8(q0p0, q5p5)));
+      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
 
       q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
       q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                            (__m64 *)(s + 7 * p)));
-
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
-                                       _mm_subs_epu8(q0p0, q6p6)),
-                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
-                                       _mm_subs_epu8(q0p0, q7p7)));
-
+      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
       flat2 = _mm_subs_epu8(flat2, one);
@@ -364,20 +353,41 @@
   }
 }
 
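+// Moving-sum update: add the incoming taps *a1 and *a2 to the running total
+// and subtract the outgoing taps *s1 and *s2.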
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+                                       const __m128i *const a1,
+                                       const __m128i *const a2,
+                                       const __m128i *const s1,
+                                       const __m128i *const s2) {
+  __m128i x = _mm_add_epi16(*a1, *total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+  return x;
+}
+
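+// Where *flat is set, take the 8-tap filter value (the >> 3 completes its
+// rounding average); elsewhere keep the ordinary filter output.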
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+                                   const __m128i *const other_filt,
+                                   const __m128i *const f8_lo,
+                                   const __m128i *const f8_hi) {
+  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
+                                      _mm_srli_epi16(*f8_hi, 3));
+  const __m128i result = _mm_and_si128(*flat, f8);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
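+// Same selection for the wide (filter16) path, whose sums are completed
+// by the >> 4.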
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+                                    const __m128i *const other_filt,
+                                    const __m128i *const f_lo,
+                                    const __m128i *const f_hi) {
+  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
+                                     _mm_srli_epi16(*f_hi, 4));
+  const __m128i result = _mm_and_si128(*flat, f);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                              int p,
                                              const unsigned char *_blimit,
                                              const unsigned char *_limit,
                                              const unsigned char *_thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
-
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -387,8 +397,14 @@
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
   __m128i q5, q6, q7;
-  int i = 0;
 
+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
@@ -399,58 +415,59 @@
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-
-  _mm_store_si128((__m128i *)&ap[4 * 16], p4);
-  _mm_store_si128((__m128i *)&ap[3 * 16], p3);
-  _mm_store_si128((__m128i *)&ap[2 * 16], p2);
-  _mm_store_si128((__m128i *)&ap[1 * 16], p1);
-  _mm_store_si128((__m128i *)&ap[0 * 16], p0);
-  _mm_store_si128((__m128i *)&aq[4 * 16], q4);
-  _mm_store_si128((__m128i *)&aq[3 * 16], q3);
-  _mm_store_si128((__m128i *)&aq[2 * 16], q2);
-  _mm_store_si128((__m128i *)&aq[1 * 16], q1);
-  _mm_store_si128((__m128i *)&aq[0 * 16], q0);
-
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
 
   {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
     const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
     __m128i work;
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
 
     abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                         _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                      _mm_subs_epu8(p2, p3)));
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
     mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                      _mm_subs_epu8(q2, q3)));
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
-  // lp filter
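+  // Flatness masks: 'flat' is set where p1..p3/q1..q3 all lie within 1 of
+  // p0/q0 (enables filter8); 'flat2' additionally requires p4..p7/q4..q7 to
+  // be that close as well (enables filter16).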
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
@@ -459,23 +476,27 @@
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
 
-    __m128i ps1 = _mm_xor_si128(p1, t80);
-    __m128i ps0 = _mm_xor_si128(p0, t80);
-    __m128i qs0 = _mm_xor_si128(q0, t80);
-    __m128i qs1 = _mm_xor_si128(q1, t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
 
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
 
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
 
@@ -485,7 +506,7 @@
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
 
     // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
@@ -493,7 +514,7 @@
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
 
     // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
@@ -503,350 +524,200 @@
     filt = _mm_and_si128(filt, t7f);
     filt = _mm_or_si128(filt, work_a);
     filt = _mm_andnot_si128(hev, filt);
-    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
     // loopfilter done
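    /* In scalar terms, the filter4 step above computes per column (the
     * op/oq registers hold the pixels xor 0x80, i.e. treated as signed):
     *   filt    = clamp(ps1 - qs1) & hev;
     *   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask;
     *   filter1 = clamp(filt + 4) >> 3;   oq0 = qs0 - filter1;
     *   filter2 = clamp(filt + 3) >> 3;   op0 = ps0 + filter2;
     *   filt    = (filter1 + 1) >> 1, applied to p1/q1 only where !hev.
     * The cmpgt/te0/t1f (and t7f) sequences emulate an arithmetic right
     * shift on packed signed bytes, which SSE2 lacks, by shifting 16-bit
     * lanes and re-inserting the sign bits. */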
 
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
     {
-      __m128i work;
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                       _mm_subs_epu8(p0, p2)),
-                           _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                        _mm_subs_epu8(q0, q2)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                       _mm_subs_epu8(p0, p3)),
-                           _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                        _mm_subs_epu8(q0, q3)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                       _mm_subs_epu8(p0, p4)),
-                           _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                        _mm_subs_epu8(q0, q4)));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
 
-      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
-                                       _mm_subs_epu8(p0, p5)),
-                           _mm_or_si128(_mm_subs_epu8(q5, q0),
-                                        _mm_subs_epu8(q0, q5)));
-      _mm_store_si128((__m128i *)&ap[5 * 16], p5);
-      _mm_store_si128((__m128i *)&aq[5 * 16], q5);
-      flat2 = _mm_max_epu8(work, flat2);
-      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
-                                       _mm_subs_epu8(p0, p6)),
-                           _mm_or_si128(_mm_subs_epu8(q6, q0),
-                                        _mm_subs_epu8(q0, q6)));
-      _mm_store_si128((__m128i *)&ap[6 * 16], p6);
-      _mm_store_si128((__m128i *)&aq[6 * 16], q6);
-      flat2 = _mm_max_epu8(work, flat2);
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
 
-      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
-                                       _mm_subs_epu8(p0, p7)),
-                           _mm_or_si128(_mm_subs_epu8(q7, q0),
-                                        _mm_subs_epu8(q0, q7)));
-      _mm_store_si128((__m128i *)&ap[7 * 16], p7);
-      _mm_store_si128((__m128i *)&aq[7 * 16], q7);
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
     }
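    /* The filter8 block above evaluates the 7-tap smoothing filter in two
     * 16-bit halves.  The running sum starts at 3*p3 + 2*p2 + p1 + p0 + q0
     * + 4 (so op2 is that sum >> 3); filter_add2_sub2() then slides the
     * window for each further output by adding two new taps and dropping
     * two stale ones, e.g. op1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3.
     * filter8_mask() keeps the smoothed value only in columns where "flat"
     * fired and falls back to the filter4 result elsewhere. */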
 
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
+    // wide flat calculations
     {
       const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i temp_flat2 = flat2;
-      unsigned char *src = s;
-      int i = 0;
-      do {
-        __m128i workp_shft;
-        __m128i a, b, c;
+      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
 
-        unsigned int off = i * 8;
-        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
-                               zero);
-        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
-                               zero);
-        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
-                               zero);
-        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
-                               zero);
-        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
-                               zero);
-        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
-                               zero);
-        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
-                               zero);
-        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
-                               zero);
-        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
-                               zero);
-        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
-                               zero);
-        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
-                               zero);
-        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
-                               zero);
-        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
-                               zero);
-        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
-                               zero);
-        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
-                               zero);
-        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
-                               zero);
+      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
 
-        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
-        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+      __m128i f_lo;
+      __m128i f_hi;
 
-        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
-        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
-        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
+      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
+                           _mm_add_epi16(p4_lo, f_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+                           _mm_add_epi16(p2_lo, p1_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
 
-        _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
+      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
+                           _mm_add_epi16(p4_hi, f_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+                           _mm_add_epi16(p2_hi, p1_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
 
-        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
 
-        a = _mm_add_epi16(q1, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
-        _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
 
-        a = _mm_add_epi16(q2, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
-        _mm_storel_epi64((__m128i *)&flat_op[i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
 
-        a = _mm_add_epi16(q3, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
-        _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
 
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
 
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
-        _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
 
-        c = _mm_add_epi16(q4, c);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
 
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
-        _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                                          , b));
-        a = _mm_add_epi16(q5, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
 
-        a = _mm_add_epi16(q6, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
 
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
 
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
 
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
-        src += 8;
-      } while (++i < 2);
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
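    /* The wide flat pass is the same sliding-window idea with a 15-tap
     * window: the seed sum is 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 +
     * q0 + 8 (so the new p6 is that sum >> 4), filter_add2_sub2() advances
     * the window one output at a time, and filter16_mask() applies the
     * result only where "flat2" fired before each row is stored back. */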
-
-    work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
-    p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
-
-    p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
-    work_a = _mm_andnot_si128(flat, ps1);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
-
-    p0 = _mm_load_si128((__m128i *)&flat_op[0]);
-    work_a = _mm_andnot_si128(flat, ps0);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_store_si128((__m128i *)&flat_op[0], p0);
-
-    q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
-    work_a = _mm_andnot_si128(flat, qs0);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_store_si128((__m128i *)&flat_oq[0], q0);
-
-    q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
-    work_a = _mm_andnot_si128(flat, qs1);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
-
-    work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
-    q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
-
-    // write out op6 - op3
-    {
-      unsigned char *dst = (s - 7 * p);
-      for (i = 6; i > 2; i--) {
-        __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
-        flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storeu_si128((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
-    p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p2 = _mm_and_si128(flat2, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
-    p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p1 = _mm_and_si128(flat2, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[0]);
-    p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p0 = _mm_and_si128(flat2, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
-    q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q0 = _mm_and_si128(flat2, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
-    q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q1 = _mm_and_si128(flat2, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
-    q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q2 = _mm_and_si128(flat2, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-
-    // write out oq3 - oq7
-    {
-      unsigned char *dst = (s + 3 * p);
-      for (i = 3; i < 7; i++) {
-        __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
-        flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storeu_si128((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
   }
 }
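The rewritten filter above leans on a handful of small helpers (abs_diff(), filter_add2_sub2(), filter8_mask() and filter16_mask()) that are defined earlier in loopfilter_sse2.c and are not visible in this hunk. Judging purely from how they are called here, they behave roughly as in the following sketch (illustrative only, not the upstream definitions verbatim):

#include <emmintrin.h>

/* Per-byte |a - b| via two saturating subtracts. */
static __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

/* Slide the running 16-bit filter sum: add two new taps, drop two old ones. */
static __m128i filter_add2_sub2(const __m128i *total, const __m128i *a1,
                                const __m128i *a2, const __m128i *s1,
                                const __m128i *s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

/* Pack the low/high 16-bit halves of the 7-tap sum (>> 3) back to bytes and
 * blend them with the existing pixels under the "flat" byte mask. */
static __m128i filter8_mask(const __m128i *flat, const __m128i *other_filt,
                            const __m128i *f8_lo, const __m128i *f8_hi) {
  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
                                      _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

/* Same idea for the 15-tap sum, shifted by 4 and selected by "flat2". */
static __m128i filter16_mask(const __m128i *flat2, const __m128i *other_filt,
                             const __m128i *f_lo, const __m128i *f_hi) {
  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
                                     _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat2, f);
  return _mm_or_si128(_mm_andnot_si128(*flat2, *other_filt), result);
}

Packing the two 16-bit halves back to bytes with _mm_packus_epi16() is what lets the whole 16-pixel row be blended and stored with a single byte mask, replacing the old 8-pixel-at-a-time do/while loop and its scratch arrays.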
 
 // TODO(yunqingwang): remove count and call these 2 functions (8 or 16) directly.
-void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                 const unsigned char *_blimit,
                                 const unsigned char *_limit,
                                 const unsigned char *_thresh, int count) {
@@ -856,16 +727,16 @@
     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }
 
-void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
@@ -893,14 +764,11 @@
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
-                            _mm_subs_epu8(q0p0, q1p1));
+    abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
 
-    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
-                            _mm_subs_epu8(p0q0, q0p0));
-    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
-                            _mm_subs_epu8(p1q1, q1p1));
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -914,10 +782,8 @@
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
 
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
-                                     _mm_subs_epu8(q1p1, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
-                                     _mm_subs_epu8(q2p2, q3p3)));
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
@@ -925,10 +791,8 @@
 
     // flat_mask4
 
-    flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
-                                     _mm_subs_epu8(q0p0, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
-                                     _mm_subs_epu8(q0p0, q3p3)));
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
+                        abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
@@ -1010,7 +874,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1079,19 +943,19 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                     const uint8_t *_blimit0,
                                     const uint8_t *_limit0,
                                     const uint8_t *_thresh0,
                                     const uint8_t *_blimit1,
                                     const uint8_t *_limit1,
                                     const uint8_t *_thresh1) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i blimit =
       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
@@ -1251,7 +1115,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1326,7 +1190,7 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0,
@@ -1422,7 +1286,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1469,43 +1333,47 @@
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  // Read in 16 lines
-  x0 = _mm_loadl_epi64((__m128i *)in0);
-  x8 = _mm_loadl_epi64((__m128i *)in1);
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
+  // 2-way interleave w/hoisting of unpacks
+  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
+  x0 = _mm_unpacklo_epi8(x0, x1);  // 1
 
-  x0 = _mm_unpacklo_epi8(x0, x1);
-  x1 = _mm_unpacklo_epi8(x2, x3);
-  x2 = _mm_unpacklo_epi8(x4, x5);
-  x3 = _mm_unpacklo_epi8(x6, x7);
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));  // 7
+  x1 = _mm_unpacklo_epi8(x2, x3);  // 2
 
-  x8 = _mm_unpacklo_epi8(x8, x9);
-  x9 = _mm_unpacklo_epi8(x10, x11);
-  x10 = _mm_unpacklo_epi8(x12, x13);
-  x11 = _mm_unpacklo_epi8(x14, x15);
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));  // 9
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));  // 11
+  x2 = _mm_unpacklo_epi8(x4, x5);  // 3
 
-  x4 = _mm_unpacklo_epi16(x0, x1);
-  x5 = _mm_unpacklo_epi16(x2, x3);
-  x12 = _mm_unpacklo_epi16(x8, x9);
-  x13 = _mm_unpacklo_epi16(x10, x11);
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));  // 13
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));  // 15
+  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
+  x4 = _mm_unpacklo_epi16(x0, x1);  // 9
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
+  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
+  x5 = _mm_unpacklo_epi16(x2, x3);  // 10
+
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));  // 8
+  x9 = _mm_unpacklo_epi8(x10, x11);  // 6
+
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));  // 10
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));  // 12
+  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
+  x12 = _mm_unpacklo_epi16(x8, x9);  // 11
+
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));  // 14
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));  // 16
+  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
+  x13 = _mm_unpacklo_epi16(x10, x11);  // 12
+
+  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
+  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
+  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
+  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
 
   // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
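  // The "// 1" .. "// 16" tags on the rewritten loads and unpacks above
  // appear to record the intended issue order: the loads for the second set
  // of eight rows are hoisted in between the unpacks of the first set so the
  // shuffle chain is not left waiting on memory.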
@@ -1541,33 +1409,36 @@
 
     x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
     x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
-    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
-    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
-    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
-    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
-    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
-    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     x0 = _mm_unpacklo_epi8(x0, x1);
+
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     x1 = _mm_unpacklo_epi8(x2, x3);
+
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
     x2 = _mm_unpacklo_epi8(x4, x5);
+
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
     x3 = _mm_unpacklo_epi8(x6, x7);
+
     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     x4 = _mm_unpacklo_epi16(x0, x1);
     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
     x5 = _mm_unpacklo_epi16(x2, x3);
     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 0*out_p),
                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
     _mm_storeh_pd((double *)(out + 1*out_p),
                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
     _mm_storel_pd((double *)(out + 2*out_p),
                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
     _mm_storeh_pd((double *)(out + 3*out_p),
@@ -1579,13 +1450,13 @@
     x5 = _mm_unpackhi_epi16(x2, x3);
     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 4*out_p),
                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
     _mm_storeh_pd((double *)(out + 5*out_p),
                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
     _mm_storel_pd((double *)(out + 6*out_p),
                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
     _mm_storeh_pd((double *)(out + 7*out_p),
@@ -1593,13 +1464,13 @@
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
   unsigned char *src[2];
   unsigned char *dst[2];
 
@@ -1607,7 +1478,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1618,11 +1489,11 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh, int count) {
-  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
   unsigned char *src[1];
   unsigned char *dst[1];
   (void)count;
@@ -1634,7 +1505,7 @@
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1643,13 +1514,13 @@
   transpose(src, 8, dst, p, 1);
 }
 
-void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
   unsigned char *src[2];
   unsigned char *dst[2];
 
@@ -1657,7 +1528,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1669,11 +1540,11 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
-  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
   unsigned char *src[2];
   unsigned char *dst[2];
 
@@ -1697,10 +1568,10 @@
   transpose(src, 8, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
 
   // Transpose 16x16
   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000..c2a804e
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        __m128i pw_1;
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
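Per coefficient, the SSE2 kernel above walks the block from the end (the pointers are advanced by n_coeffs and indexed with a negative offset) and applies the usual libvpx quantizer arithmetic in 16-bit lanes. Stripped of the vectorization and the skip_block path, the math is roughly the following scalar sketch (the function name is made up for illustration; this is not the library's C fallback verbatim):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar equivalent of the per-coefficient math above.  Index 0
 * of the zbin/round/quant/shift/dequant pairs applies to the DC coefficient,
 * index 1 to every AC coefficient, mirroring the _mm_unpackhi_epi64 switch. */
static void quantize_b_scalar_sketch(const int16_t *coeff, intptr_t n_coeffs,
                                     const int16_t *zbin, const int16_t *round,
                                     const int16_t *quant,
                                     const int16_t *quant_shift,
                                     int16_t *qcoeff, int16_t *dqcoeff,
                                     const int16_t *dequant,
                                     const int16_t *iscan, uint16_t *eob_ptr) {
  intptr_t i;
  uint16_t eob = 0;
  for (i = 0; i < n_coeffs; ++i) {
    const int rc = (i == 0) ? 0 : 1;
    const int abs_coeff = abs(coeff[i]);
    int q = 0;
    if (abs_coeff > zbin[rc] - 1) {            /* the pw_1-adjusted zbin test */
      const int tmp = abs_coeff + round[rc];   /* saturating add in the SSE2 code */
      q = ((((tmp * quant[rc]) >> 16) + tmp) * quant_shift[rc]) >> 16;
      if (coeff[i] < 0) q = -q;                /* reinsert the sign */
    }
    qcoeff[i] = (int16_t)q;
    dqcoeff[i] = (int16_t)(q * dequant[rc]);
    /* eob is reported as a count: 1 + the largest inverse-scan position
     * that produced a nonzero dequantized coefficient. */
    if (dqcoeff[i] != 0 && (uint16_t)(iscan[i] + 1) > eob)
      eob = (uint16_t)(iscan[i] + 1);
  }
  *eob_ptr = eob;
}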
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000..3784d9d
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang): fix quantize_b code for skip=1 case.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
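
The SSSE3 macro above vectorizes the VP9-style quantizer: take the absolute
value of each coefficient, drop anything below the zero bin, add the rounding
term, apply the 16-bit quant multiply plus the quant-shift multiply, restore
the sign, emit the dequantized value, and track the end-of-block as the
maximum of scan[i] + 1 over the surviving nonzero coefficients (the b_32x32
variant halves the rounding term and the dequantized result and doubles the
shift, as the %ifidn branches show). A rough scalar sketch of the
per-coefficient step, with simplified names and a single zbin/round/quant/
shift value instead of the separate DC/AC pairs the real function takes:

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative scalar form of one quantize/dequantize step; not the
   * library API.  Returns nonzero when the coefficient survives and is
   * therefore a candidate for the eob maximum. */
  static int quantize_step(int16_t coeff, int16_t zbin, int16_t round,
                           int16_t quant, int16_t quant_shift,
                           int16_t dequant,
                           int16_t *qcoeff, int16_t *dqcoeff) {
    const int abs_coeff = abs(coeff);
    int tmp;
    if (abs_coeff < zbin) {                        /* pcmpgtw mask is zero */
      *qcoeff = 0;
      *dqcoeff = 0;
      return 0;
    }
    tmp = abs_coeff + round;                       /* paddsw with round */
    tmp = ((tmp * quant) >> 16) + tmp;             /* pmulhw + paddw */
    tmp = (tmp * quant_shift) >> 16;               /* pmulhw with shift */
    *qcoeff = (int16_t)(coeff < 0 ? -tmp : tmp);   /* psignw */
    *dqcoeff = (int16_t)(*qcoeff * dequant);       /* pmullw */
    return 1;
  }
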
diff --git a/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/libvpx/vpx_dsp/x86/sad4d_avx2.c
similarity index 79%
rename from libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
rename to libvpx/vpx_dsp/x86/sad4d_avx2.c
index 1feed62..793658f 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -8,18 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_sad32x32x4d_avx2(uint8_t *src,
+void vpx_sad32x32x4d_avx2(const uint8_t *src,
                           int src_stride,
-                          uint8_t *ref[4],
+                          const uint8_t *const ref[4],
                           int ref_stride,
-                          unsigned int res[4]) {
+                          uint32_t res[4]) {
   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m256i sum_mlow, sum_mhigh;
   int i;
-  uint8_t *ref0, *ref1, *ref2, *ref3;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
 
   ref0 = ref[0];
   ref1 = ref[1];
@@ -31,11 +32,11 @@
   sum_ref3 = _mm256_set1_epi16(0);
   for (i = 0; i < 32 ; i++) {
     // load src and all refs
-    src_reg = _mm256_loadu_si256((__m256i *)(src));
-    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
-    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
-    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
-    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
+    src_reg = _mm256_loadu_si256((const __m256i *)src);
+    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
     // sum of the absolute differences between every ref-i to src
     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
@@ -80,18 +81,18 @@
   }
 }
 
-void vp9_sad64x64x4d_avx2(uint8_t *src,
+void vpx_sad64x64x4d_avx2(const uint8_t *src,
                           int src_stride,
-                          uint8_t *ref[4],
+                          const uint8_t *const ref[4],
                           int ref_stride,
-                          unsigned int res[4]) {
+                          uint32_t res[4]) {
   __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
   __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
   __m256i ref3_reg, ref3next_reg;
   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m256i sum_mlow, sum_mhigh;
   int i;
-  uint8_t *ref0, *ref1, *ref2, *ref3;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
 
   ref0 = ref[0];
   ref1 = ref[1];
@@ -103,16 +104,16 @@
   sum_ref3 = _mm256_set1_epi16(0);
   for (i = 0; i < 64 ; i++) {
     // load 64 bytes from src and all refs
-    src_reg = _mm256_loadu_si256((__m256i *)(src));
-    srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32));
-    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
-    ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32));
-    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
-    ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32));
-    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
-    ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32));
-    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
-    ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32));
+    src_reg = _mm256_loadu_si256((const __m256i *)src);
+    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
+    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
+    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
+    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
+    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
+    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
     // sum of the absolute differences between every ref-i to src
     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
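
The x4d functions above compute the SAD of one source block against four
reference blocks in a single call, which the motion search typically uses to
score four candidate positions at once. Each row is loaded once from src,
_mm256_sad_epu8 is applied against each of the four refs, and the four
accumulators are only reduced to scalars at the end. A hedged sketch of that
final reduction for a single accumulator (an illustrative helper, not a
function from this file, which interleaves the four sums before reducing):

  #include <immintrin.h>
  #include <stdint.h>

  /* Collapse the four 64-bit partial sums left by _mm256_sad_epu8
   * accumulation into one 32-bit SAD total. */
  static uint32_t reduce_sad256(__m256i sum) {
    const __m128i lo = _mm256_castsi256_si128(sum);
    const __m128i hi = _mm256_extracti128_si256(sum, 1);
    __m128i s = _mm_add_epi32(lo, hi);             /* fold the 128-bit halves */
    s = _mm_add_epi32(s, _mm_srli_si128(s, 8));    /* fold the upper 64 bits */
    return (uint32_t)_mm_cvtsi128_si32(s);
  }
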
diff --git a/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm b/libvpx/vpx_dsp/x86/sad4d_sse2.asm
similarity index 98%
rename from libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm
rename to libvpx/vpx_dsp/x86/sad4d_sse2.asm
index b493628..a2f0ae7 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -167,9 +167,9 @@
   PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
 %endmacro
 
-; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
-;                         unsigned int res[4]);
+;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
 %macro SADNXN4D 2
 %if UNIX64
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000..ce9ad8f
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#define FSAD64_H(h) \
+unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; \
+    src_ptr+= src_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSAD32_H(h) \
+unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSAD64 \
+FSAD64_H(64); \
+FSAD64_H(32);
+
+#define FSAD32 \
+FSAD32_H(64); \
+FSAD32_H(32); \
+FSAD32_H(16);
+
+FSAD64;
+FSAD32;
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) \
+unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; \
+    src_ptr+= src_stride; \
+    second_pred+= 64; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSADAVG32_H(h) \
+unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, \
+              _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+    second_pred+= 64; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); \
+  return res; \
+}
+
+#define FSADAVG64 \
+FSADAVG64_H(64); \
+FSADAVG64_H(32);
+
+#define FSADAVG32 \
+FSADAVG32_H(64); \
+FSADAVG32_H(32); \
+FSADAVG32_H(16);
+
+FSADAVG64;
+FSADAVG32;
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
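
The FSAD*_H and FSADAVG*_H macros above stamp out one function per block
size; the plain variants SAD a full 32- or 64-byte row per _mm256_sad_epu8,
while the _avg variants first average the reference row with a contiguous
second predictor via _mm256_avg_epu8 (compound prediction). Scalar
references for both, with illustrative names:

  #include <stdint.h>
  #include <stdlib.h>

  /* Plain SAD over a width x height block. */
  static unsigned int sad_ref(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int width, int height) {
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c)
        sad += abs(src[c] - ref[c]);
      src += src_stride;
      ref += ref_stride;
    }
    return sad;
  }

  /* SAD against the rounded average of ref and a second predictor laid out
   * contiguously (stride == width), matching the (a + b + 1) >> 1 rounding
   * of _mm256_avg_epu8. */
  static unsigned int sad_avg_ref(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred,
                                  int width, int height) {
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c)
        sad += abs(src[c] - ((ref[c] + second_pred[c] + 1) >> 1));
      src += src_stride;
      ref += ref_stride;
      second_pred += width;
    }
    return sad;
  }
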
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm b/libvpx/vpx_dsp/x86/sad_mmx.asm
similarity index 95%
rename from libvpx/vp9/encoder/x86/vp9_sad_mmx.asm
rename to libvpx/vpx_dsp/x86/sad_mmx.asm
index 32fdd23..9968992 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm
+++ b/libvpx/vpx_dsp/x86/sad_mmx.asm
@@ -11,18 +11,18 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-global sym(vp9_sad16x16_mmx) PRIVATE
-global sym(vp9_sad8x16_mmx) PRIVATE
-global sym(vp9_sad8x8_mmx) PRIVATE
-global sym(vp9_sad4x4_mmx) PRIVATE
-global sym(vp9_sad16x8_mmx) PRIVATE
+global sym(vpx_sad16x16_mmx) PRIVATE
+global sym(vpx_sad8x16_mmx) PRIVATE
+global sym(vpx_sad8x8_mmx) PRIVATE
+global sym(vpx_sad4x4_mmx) PRIVATE
+global sym(vpx_sad16x8_mmx) PRIVATE
 
-;unsigned int vp9_sad16x16_mmx(
+;unsigned int vpx_sad16x16_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-sym(vp9_sad16x16_mmx):
+sym(vpx_sad16x16_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
@@ -109,12 +109,12 @@
     ret
 
 
-;unsigned int vp9_sad8x16_mmx(
+;unsigned int vpx_sad8x16_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-sym(vp9_sad8x16_mmx):
+sym(vpx_sad8x16_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
@@ -181,12 +181,12 @@
     ret
 
 
-;unsigned int vp9_sad8x8_mmx(
+;unsigned int vpx_sad8x8_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-sym(vp9_sad8x8_mmx):
+sym(vpx_sad8x8_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
@@ -251,12 +251,12 @@
     ret
 
 
-;unsigned int vp9_sad4x4_mmx(
+;unsigned int vpx_sad4x4_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-sym(vp9_sad4x4_mmx):
+sym(vpx_sad4x4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
@@ -340,12 +340,12 @@
     ret
 
 
-;unsigned int vp9_sad16x8_mmx(
+;unsigned int vpx_sad16x8_mmx(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
-sym(vp9_sad16x8_mmx):
+sym(vpx_sad16x8_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/libvpx/vpx_dsp/x86/sad_sse2.asm
similarity index 95%
rename from libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
rename to libvpx/vpx_dsp/x86/sad_sse2.asm
index c4c5c54..0defe1b 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -44,7 +44,7 @@
 %endif ; %3 == 7
 %endmacro
 
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
   SAD_FN 64, %1, 5, %2
@@ -87,7 +87,7 @@
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
 
-; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD32XN 1-2 0
   SAD_FN 32, %1, 5, %2
@@ -132,7 +132,7 @@
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
 
-; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1-2 0
   SAD_FN 16, %1, 7, %2
@@ -178,7 +178,7 @@
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
 
-; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD8XN 1-2 0
   SAD_FN 8, %1, 7, %2
@@ -222,7 +222,7 @@
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
 
-; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
+; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm
similarity index 94%
rename from libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
rename to libvpx/vpx_dsp/x86/sad_sse3.asm
index 2b90a5d..18279bd 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -19,7 +19,6 @@
   %define     end_ptr       rcx
   %define     ret_var       rbx
   %define     result_ptr    arg(4)
-  %define     max_err       arg(4)
   %define     height        dword ptr arg(4)
     push        rbp
     mov         rbp,        rsp
@@ -42,7 +41,6 @@
     %define     end_ptr     r10
     %define     ret_var     r11
     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     max_err     [rsp+xmm_stack_space+8+4*8]
     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
   %else
     %define     src_ptr     rdi
@@ -52,7 +50,6 @@
     %define     end_ptr     r9
     %define     ret_var     r10
     %define     result_ptr  r8
-    %define     max_err     r8
     %define     height      r8
   %endif
 %endif
@@ -67,7 +64,6 @@
   %define     end_ptr
   %define     ret_var
   %define     result_ptr
-  %define     max_err
   %define     height
 
 %if ABI_IS_32BIT
@@ -169,14 +165,14 @@
         paddw           mm7,       mm3
 %endmacro
 
-;void int vp9_sad16x16x3_sse3(
+;void vpx_sad16x16x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x3_sse3) PRIVATE
-sym(vp9_sad16x16x3_sse3):
+global sym(vpx_sad16x16x3_sse3) PRIVATE
+sym(vpx_sad16x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
 
@@ -211,14 +207,14 @@
 
     STACK_FRAME_DESTROY_X3
 
-;void int vp9_sad16x8x3_sse3(
+;void vpx_sad16x8x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x8x3_sse3) PRIVATE
-sym(vp9_sad16x8x3_sse3):
+global sym(vpx_sad16x8x3_sse3) PRIVATE
+sym(vpx_sad16x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
 
@@ -249,14 +245,14 @@
 
     STACK_FRAME_DESTROY_X3
 
-;void int vp9_sad8x16x3_sse3(
+;void vpx_sad8x16x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x16x3_sse3) PRIVATE
-sym(vp9_sad8x16x3_sse3):
+global sym(vpx_sad8x16x3_sse3) PRIVATE
+sym(vpx_sad8x16x3_sse3):
 
     STACK_FRAME_CREATE_X3
 
@@ -278,14 +274,14 @@
 
     STACK_FRAME_DESTROY_X3
 
-;void int vp9_sad8x8x3_sse3(
+;void vpx_sad8x8x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad8x8x3_sse3) PRIVATE
-sym(vp9_sad8x8x3_sse3):
+global sym(vpx_sad8x8x3_sse3) PRIVATE
+sym(vpx_sad8x8x3_sse3):
 
     STACK_FRAME_CREATE_X3
 
@@ -303,14 +299,14 @@
 
     STACK_FRAME_DESTROY_X3
 
-;void int vp9_sad4x4x3_sse3(
+;void vpx_sad4x4x3_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad4x4x3_sse3) PRIVATE
-sym(vp9_sad4x4x3_sse3):
+global sym(vpx_sad4x4x3_sse3) PRIVATE
+sym(vpx_sad4x4x3_sse3):
 
     STACK_FRAME_CREATE_X3
 
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm
similarity index 94%
rename from libvpx/vp9/encoder/x86/vp9_sad_sse4.asm
rename to libvpx/vpx_dsp/x86/sad_sse4.asm
index faf1768..bc67447 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -165,14 +165,14 @@
     movdqa          [rdi + 16],    xmm2
 %endmacro
 
-;void vp9_sad16x16x8_sse4(
+;void vpx_sad16x16x8_sse4_1(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4) PRIVATE
-sym(vp9_sad16x16x8_sse4):
+global sym(vpx_sad16x16x8_sse4_1) PRIVATE
+sym(vpx_sad16x16x8_sse4_1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -205,15 +205,15 @@
     ret
 
 
-;void vp9_sad16x8x8_sse4(
+;void vpx_sad16x8x8_sse4_1(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad16x8x8_sse4) PRIVATE
-sym(vp9_sad16x8x8_sse4):
+global sym(vpx_sad16x8x8_sse4_1) PRIVATE
+sym(vpx_sad16x8x8_sse4_1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -242,15 +242,15 @@
     ret
 
 
-;void vp9_sad8x8x8_sse4(
+;void vpx_sad8x8x8_sse4_1(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad8x8x8_sse4) PRIVATE
-sym(vp9_sad8x8x8_sse4):
+global sym(vpx_sad8x8x8_sse4_1) PRIVATE
+sym(vpx_sad8x8x8_sse4_1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -279,15 +279,15 @@
     ret
 
 
-;void vp9_sad8x16x8_sse4(
+;void vpx_sad8x16x8_sse4_1(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad8x16x8_sse4) PRIVATE
-sym(vp9_sad8x16x8_sse4):
+global sym(vpx_sad8x16x8_sse4_1) PRIVATE
+sym(vpx_sad8x16x8_sse4_1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -320,15 +320,15 @@
     ret
 
 
-;void vp9_sad4x4x8_c(
+;void vpx_sad4x4x8_sse4_1(
 ;    const unsigned char *src_ptr,
 ;    int  src_stride,
 ;    const unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    unsigned short *sad_array
 ;);
-global sym(vp9_sad4x4x8_sse4) PRIVATE
-sym(vp9_sad4x4x8_sse4):
+global sym(vpx_sad4x4x8_sse4_1) PRIVATE
+sym(vpx_sad4x4x8_sse4_1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm
similarity index 64%
rename from libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
rename to libvpx/vpx_dsp/x86/sad_ssse3.asm
index 0cb3542..49f204f 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -146,14 +146,14 @@
 
 %endmacro
 
-;void int vp9_sad16x16x3_ssse3(
+;void vpx_sad16x16x3_ssse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x3_ssse3) PRIVATE
-sym(vp9_sad16x16x3_ssse3):
+global sym(vpx_sad16x16x3_ssse3) PRIVATE
+sym(vpx_sad16x16x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -169,31 +169,31 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
-        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
+        jmp .vpx_sad16x16x3_ssse3_skiptable
+.vpx_sad16x16x3_ssse3_jumptable:
+        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_skiptable:
 
-        call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
+        call .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
+        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
         add             rcx,        rax
@@ -203,23 +203,23 @@
 
         jmp             rcx
 
-        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
 
-.vp9_sad16x16x3_ssse3_aligned_by_15:
+.vpx_sad16x16x3_ssse3_aligned_by_15:
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
@@ -229,7 +229,7 @@
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-.vp9_sad16x16x3_ssse3_store_off:
+.vpx_sad16x16x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
@@ -259,14 +259,14 @@
     pop         rbp
     ret
 
-;void int vp9_sad16x8x3_ssse3(
+;void vpx_sad16x8x3_ssse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x8x3_ssse3) PRIVATE
-sym(vp9_sad16x8x3_ssse3):
+global sym(vpx_sad16x8x3_ssse3) PRIVATE
+sym(vpx_sad16x8x3_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -282,31 +282,31 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
-        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
+        jmp .vpx_sad16x8x3_ssse3_skiptable
+.vpx_sad16x8x3_ssse3_jumptable:
+        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_skiptable:
 
-        call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
+        call .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
+        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
         add             rcx,        rax
@@ -316,30 +316,30 @@
 
         jmp             rcx
 
-        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
 
-.vp9_sad16x8x3_ssse3_aligned_by_15:
+.vpx_sad16x8x3_ssse3_aligned_by_15:
 
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-.vp9_sad16x8x3_ssse3_store_off:
+.vpx_sad16x8x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
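
The SSSE3 SAD routines above pick a code path from the low four bits of the
reference pointer: the call/pop pair recovers the current address so the
relative jump table stays position independent, and each of the sixteen
targets is a copy of the row loop specialized for that byte alignment in the
PROCESS_* macros. A loose C analogy of the dispatch only; the real code
jumps to local asm labels, not through function pointers:

  #include <stdint.h>

  typedef void (*sad_x3_path)(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int *results);

  /* Illustrative: index one of sixteen alignment-specialized paths by
   * ref & 0xf, mirroring the asm jump table above. */
  static void sad16x16x3_dispatch(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int *results, const sad_x3_path path[16]) {
    path[(uintptr_t)ref & 0xf](src, src_stride, ref, ref_stride, results);
  }
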
diff --git a/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
similarity index 92%
rename from libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm
rename to libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
index 5964a85..6d58321 100644
--- a/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm
+++ b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -49,11 +49,11 @@
 ;    int sp,
 ;    unsigned char *r,
 ;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
 ;
 ; TODO: Use parm passing through structure, probably don't need the pxors
 ; ( calling app will initialize to 0 ) could easily fit everything in sse2
@@ -61,8 +61,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
-sym(vp8_ssim_parms_16x16_sse2):
+global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
+sym(vpx_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -139,11 +139,11 @@
 ;    int sp,
 ;    unsigned char *r,
 ;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
 ;
 ; TODO: Use parm passing through structure, probably don't need the pxors
 ; ( calling app will initialize to 0 ) could easily fit everything in sse2
@@ -151,8 +151,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
-sym(vp8_ssim_parms_8x8_sse2):
+global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
+sym(vpx_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
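
Per the comment blocks above, the ssim_parms kernels accumulate the five sums
SSIM needs over a 16x16 or 8x8 window: sum of source, sum of reference, both
sums of squares, and the cross product; the caller zeroes the outputs first,
as the TODO notes. A scalar sketch of the 8x8 case (illustrative, mirroring
the prototype in the comments):

  #include <stdint.h>

  static void ssim_parms_8x8_ref(const uint8_t *s, int sp,
                                 const uint8_t *r, int rp,
                                 uint32_t *sum_s, uint32_t *sum_r,
                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                                 uint32_t *sum_sxr) {
    int i, j;
    for (i = 0; i < 8; ++i, s += sp, r += rp) {
      for (j = 0; j < 8; ++j) {
        *sum_s += s[j];
        *sum_r += r[j];
        *sum_sq_s += s[j] * s[j];
        *sum_sq_r += r[j] * r[j];
        *sum_sxr += s[j] * r[j];
      }
    }
  }
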
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
similarity index 95%
rename from libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
rename to libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
index 1a9e4e8..05dcff7 100644
--- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -14,56 +14,32 @@
 pw_8: times  8 dw  8
 bilin_filter_m_sse2: times  8 dw 16
                      times  8 dw  0
-                     times  8 dw 15
-                     times  8 dw  1
                      times  8 dw 14
                      times  8 dw  2
-                     times  8 dw 13
-                     times  8 dw  3
                      times  8 dw 12
                      times  8 dw  4
-                     times  8 dw 11
-                     times  8 dw  5
                      times  8 dw 10
                      times  8 dw  6
-                     times  8 dw  9
-                     times  8 dw  7
                      times 16 dw  8
-                     times  8 dw  7
-                     times  8 dw  9
                      times  8 dw  6
                      times  8 dw 10
-                     times  8 dw  5
-                     times  8 dw 11
                      times  8 dw  4
                      times  8 dw 12
-                     times  8 dw  3
-                     times  8 dw 13
                      times  8 dw  2
                      times  8 dw 14
-                     times  8 dw  1
-                     times  8 dw 15
 
 bilin_filter_m_ssse3: times  8 db 16,  0
-                      times  8 db 15,  1
                       times  8 db 14,  2
-                      times  8 db 13,  3
                       times  8 db 12,  4
-                      times  8 db 11,  5
                       times  8 db 10,  6
-                      times  8 db  9,  7
                       times 16 db  8
-                      times  8 db  7,  9
                       times  8 db  6, 10
-                      times  8 db  5, 11
                       times  8 db  4, 12
-                      times  8 db  3, 13
                       times  8 db  2, 14
-                      times  8 db  1, 15
 
 SECTION .text
 
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
 ;                               int height, unsigned int *sse);
@@ -101,7 +77,7 @@
   pshufd               m4, m6, 0x1
   movd               [r1], m7           ; store sse
   paddd                m6, m4
-  movd                rax, m6           ; store sum as return value
+  movd               raxd, m6           ; store sum as return value
 %else ; mmsize == 8
   pshufw               m4, m6, 0xe
   pshufw               m3, m7, 0xe
@@ -113,7 +89,7 @@
   movd               [r1], m7           ; store sse
   pshufw               m4, m6, 0xe
   paddd                m6, m4
-  movd                rax, m6           ; store sum as return value
+  movd               raxd, m6           ; store sum as return value
 %endif
   RET
 %endmacro
@@ -149,7 +125,7 @@
     cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                   y_offset, dst, dst_stride, height, sse
   %endif
-  %define h heightd
+  %define block_height heightd
   %define bilin_filter sseq
 %else
   %if ARCH_X86=1 && CONFIG_PIC=1
@@ -159,7 +135,7 @@
                                   dst, dst_stride, \
                                   sec, sec_stride, \
                                   height, sse, g_bilin_filter, g_pw_8
-      %define h dword heightm
+      %define block_height dword heightm
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
@@ -177,7 +153,7 @@
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                 y_offset, dst, dst_stride, height, sse, \
                                 g_bilin_filter, g_pw_8
-      %define h heightd
+      %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
       GET_GOT eax
@@ -200,16 +176,16 @@
                                              sec, sec_stride, \
                                              height, sse
       %if ARCH_X86_64
-      %define h heightd
+      %define block_height heightd
       %define sec_str sec_strideq
       %else
-      %define h dword heightm
+      %define block_height dword heightm
       %define sec_str sec_stridemp
       %endif
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                               y_offset, dst, dst_stride, height, sse
-      %define h heightd
+      %define block_height heightd
     %endif
 
     %define bilin_filter bilin_filter_m
@@ -223,7 +199,7 @@
   ; could perhaps use it for something more productive then
   pxor                 m5, m5           ; dedicated zero register
 %if %1 < 16
-  sar                   h, 1
+  sar                   block_height, 1
 %if %2 == 1 ; avg
   shl             sec_str, 1
 %endif
@@ -289,7 +265,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_zero_y_zero_loop
   STORE_AND_RET
 
@@ -361,7 +337,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_zero_y_half_loop
   STORE_AND_RET
 
@@ -490,7 +466,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_zero_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
@@ -561,7 +537,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_half_y_zero_loop
   STORE_AND_RET
 
@@ -659,7 +635,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_half_y_half_loop
   STORE_AND_RET
 
@@ -801,7 +777,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_half_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
@@ -933,7 +909,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_other_y_zero_loop
 %undef filter_x_a
 %undef filter_x_b
@@ -1125,7 +1101,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_other_y_half_loop
 %undef filter_x_a
 %undef filter_x_b
@@ -1379,7 +1355,7 @@
 %if %2 == 1 ; avg
   add                secq, sec_str
 %endif
-  dec                   h
+  dec                   block_height
   jg .x_other_y_other_loop
 %undef filter_x_a
 %undef filter_x_b
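
The subpel variance kernel above builds its filtered block with a two-tap
bilinear filter: x_offset and y_offset index bilin_filter_m, every remaining
row of which is a tap pair summing to 16, and each filtered sample is rounded
with pw_8 and shifted right by 4; offsets 0 and 8 take the special-cased copy
and averaging paths instead. A one-line scalar sketch of a filtered sample
(illustrative helper):

  #include <stdint.h>

  /* Two-tap bilinear sample: f0 + f1 == 16, as in every row of
   * bilin_filter_m, rounded with +8 then >> 4. */
  static uint8_t bilinear_tap(uint8_t a, uint8_t b, int f0, int f1) {
    return (uint8_t)((a * f0 + b * f1 + 8) >> 4);
  }
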
diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm b/libvpx/vpx_dsp/x86/subtract_sse2.asm
similarity index 98%
rename from libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm
rename to libvpx/vpx_dsp/x86/subtract_sse2.asm
index 9824080..4273efb 100644
--- a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/libvpx/vpx_dsp/x86/subtract_sse2.asm
@@ -12,7 +12,7 @@
 
 SECTION .text
 
-; void vp9_subtract_block(int rows, int cols,
+; void vpx_subtract_block(int rows, int cols,
 ;                         int16_t *diff, ptrdiff_t diff_stride,
 ;                         const uint8_t *src, ptrdiff_t src_stride,
 ;                         const uint8_t *pred, ptrdiff_t pred_stride)
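
vpx_subtract_block writes the per-pixel residual between source and
prediction into a 16-bit diff buffer; a scalar reference matching the
prototype in the comment above (illustrative, not the SSE2 body):

  #include <stddef.h>
  #include <stdint.h>

  static void subtract_block_ref(int rows, int cols,
                                 int16_t *diff, ptrdiff_t diff_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 const uint8_t *pred, ptrdiff_t pred_stride) {
    int r, c;
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; ++c)
        diff[c] = (int16_t)(src[c] - pred[c]);
      diff += diff_stride;
      src += src_stride;
      pred += pred_stride;
    }
  }
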
diff --git a/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libvpx/vpx_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000..536b206
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "vpx/vpx_integer.h"
+
+#define pair_set_epi16(a, b) \
+  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define dual_set_epi16(a, b) \
+  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif  // VPX_DSP_X86_TXFM_COMMON_SSE2_H_
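
These macros build the constant vectors the SSE2 transforms feed to
pmaddwd-style butterflies: pair_set_epi16(c0, c1) interleaves two constants
so a single _mm_madd_epi16 against data interleaved as (x, y) pairs yields
x*c0 + y*c1 per 32-bit lane. A small usage sketch; the macro is copied from
the header above so the snippet is self-contained, and the constants are
placeholders rather than real cospi values from the transform tables:

  #include <emmintrin.h>
  #include <stdint.h>

  #define pair_set_epi16(a, b) \
    _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                  (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))

  /* in holds interleaved 16-bit pairs (x in even lanes, y in odd lanes);
   * the result is four 32-bit values x*3 + y*5. */
  static __m128i butterfly_example(__m128i in) {
    const __m128i k = pair_set_epi16(3, 5);
    return _mm_madd_epi16(in, k);
  }
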
diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000..7851a98
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,183 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
+
+void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int  ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += 16) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vpx_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
+  unsigned int sse2;
+  const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  unsigned int sse1;
+  const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
+  unsigned int sse2;
+  const int se2 =
+    vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                        y_offset, dst + 32, dst_stride,
+                                        sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
+
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  // Process 32 elements in parallel.
+  const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/libvpx/vpx_dsp/x86/variance_impl_avx2.c
similarity index 67%
rename from libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
rename to libvpx/vpx_dsp/x86/variance_impl_avx2.c
index 9aa4da9..b289e9a 100644
--- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/variance_impl_avx2.c
@@ -9,44 +9,232 @@
  */
 
 #include <immintrin.h>  // AVX2
+
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp9/encoder/vp9_variance.h"
 
 DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
   16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
   16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
-  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
-  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
   14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
   14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
-  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
-  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
   12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
   12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
-  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
-  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
   10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
   10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
-  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
-  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
-  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
   6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
   6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
-  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
-  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
   4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
   4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
-  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
-  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
   2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
   2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
-  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
 };
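+// Each entry above repeats the pair of bilinear taps (16 - k, k) across a full
+// 256-bit register; the two taps of every pair sum to 16.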
 
+
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
+                          int source_stride,
+                          const unsigned char *ref_ptr,
+                          int recon_stride,
+                          unsigned int *SSE,
+                          int *Sum) {
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    unsigned int i, src_2strides, ref_2strides;
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+    // Process two strides per 256-bit register, halving the number of loop
+    // iterations compared to the SSE2 code.
+    src_2strides = source_stride << 1;
+    ref_2strides = recon_stride << 1;
+    for (i = 0; i < 8; i++) {
+        src = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i const *) (src_ptr)));
+        src = _mm256_inserti128_si256(src,
+              _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);
+
+        ref = _mm256_castsi128_si256(
+             _mm_loadu_si128((__m128i const *) (ref_ptr)));
+        ref = _mm256_inserti128_si256(ref,
+              _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);
+
+        // expand each byte to a 16-bit lane
+        src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+        src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+        ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+        ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+        // src-ref
+        src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+        src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+        // madd low (src - ref)
+        madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+        // add high to low
+        src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+        // madd high (src - ref)
+        madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+        sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+        // add high to low
+        madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                       _mm256_add_epi32(madd_low, madd_high));
+
+        src_ptr+= src_2strides;
+        ref_ptr+= ref_2strides;
+    }
+
+    {
+        __m128i sum_res, madd_res;
+        __m128i expand_sum_low, expand_sum_high, expand_sum;
+        __m128i expand_madd_low, expand_madd_high, expand_madd;
+        __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+        // add the high 128-bit lane to the low lane
+        sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+                                _mm256_extractf128_si256(sum_ref_src, 1));
+
+        madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+                                 _mm256_extractf128_si256(madd_ref_src, 1));
+
+        // place each 16-bit sum in the high half of a 32-bit lane
+        expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
+                                            sum_res);
+        expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
+                                             sum_res);
+
+        // arithmetic shift right by 16 to sign-extend
+        expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+        expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+
+        expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+
+        // expand each 32 bits of the madd result to 64 bits
+        expand_madd_low = _mm_unpacklo_epi32(madd_res,
+                          _mm256_castsi256_si128(zero_reg));
+        expand_madd_high = _mm_unpackhi_epi32(madd_res,
+                           _mm256_castsi256_si128(zero_reg));
+
+        expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+
+        ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
+                            _mm256_castsi256_si128(zero_reg));
+        ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
+                             _mm256_castsi256_si128(zero_reg));
+
+        ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+        // shift right by 8 bytes
+        madd_res = _mm_srli_si128(expand_madd, 8);
+        sum_res = _mm_srli_si128(ex_expand_sum, 8);
+
+        madd_res = _mm_add_epi32(madd_res, expand_madd);
+        sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+
+        *((int*)SSE)= _mm_cvtsi128_si32(madd_res);
+
+        *((int*)Sum)= _mm_cvtsi128_si32(sum_res);
+    }
+}
+
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
+                          int source_stride,
+                          const unsigned char *ref_ptr,
+                          int recon_stride,
+                          unsigned int *SSE,
+                          int *Sum) {
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    unsigned int i;
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+    // processing 32 elements in parallel
+    for (i = 0; i < 16; i++) {
+       src = _mm256_loadu_si256((__m256i const *) (src_ptr));
+
+       ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));
+
+       // expand each byte to a 16-bit lane
+       src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+       src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+       ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+       ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+       // src-ref
+       src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+       src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+       // madd low (src - ref)
+       madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+       // add high to low
+       src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+       // madd high (src - ref)
+       madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+       sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+       // add high to low
+       madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                      _mm256_add_epi32(madd_low, madd_high));
+
+       src_ptr+= source_stride;
+       ref_ptr+= recon_stride;
+    }
+
+    {
+      __m256i expand_sum_low, expand_sum_high, expand_sum;
+      __m256i expand_madd_low, expand_madd_high, expand_madd;
+      __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+      // place each 16-bit sum in the high half of a 32-bit lane
+      expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+      expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+
+      // arithmetic shift right by 16 to sign-extend
+      expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+      expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+
+      expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+
+      // expand each 32 bits of the madd result to 64 bits
+      expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+      expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+
+      expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+
+      ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+      ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+
+      ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+      // shift right by 8 bytes
+      madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+      sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+
+      madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+      sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+
+      // extract the low lane and the high lane and add the results
+      *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+
+      *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+    }
+}
+
 #define FILTER_SRC(filter) \
   /* filter the source */ \
   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
@@ -118,7 +306,7 @@
         _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
 
 
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
@@ -309,7 +497,7 @@
   return sum;
 }
 
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
@@ -333,7 +521,7 @@
     if (y_offset == 0) {
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -347,7 +535,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -369,7 +557,7 @@
         MERGE_NEXT_SRC(src_reg, src_stride)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_reg, zero_reg)
@@ -385,7 +573,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -409,7 +597,7 @@
         AVG_NEXT_SRC(src_reg, 1)
         // average between previous average to current average
         src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -437,7 +625,7 @@
         MERGE_WITH_SRC(src_avg, src_reg)
         FILTER_SRC(filter)
         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_avg, zero_reg)
@@ -459,7 +647,7 @@
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         MERGE_WITH_SRC(src_reg, zero_reg)
         sec+= sec_stride;
@@ -487,7 +675,7 @@
         src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
         // average between previous pack to the current
         src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_pack, zero_reg)
@@ -524,7 +712,7 @@
         // filter the source
         FILTER_SRC(yfilter)
         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         MERGE_WITH_SRC(src_pack, zero_reg)
         src_pack = src_reg;
diff --git a/libvpx/vp8/common/x86/variance_impl_mmx.asm b/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
similarity index 88%
rename from libvpx/vp8/common/x86/variance_impl_mmx.asm
rename to libvpx/vpx_dsp/x86/variance_impl_mmx.asm
index 7d5e681..b8ba79b 100644
--- a/libvpx/vp8/common/x86/variance_impl_mmx.asm
+++ b/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
@@ -11,9 +11,11 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
-global sym(vp8_get_mb_ss_mmx) PRIVATE
-sym(vp8_get_mb_ss_mmx):
+%define mmx_filter_shift            7
+
+;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
+global sym(vpx_get_mb_ss_mmx) PRIVATE
+sym(vpx_get_mb_ss_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -52,7 +54,6 @@
         movsxd      rcx, dword ptr [rsp+4]
         add         rax, rcx
 
-
     ; begin epilog
     add rsp, 8
     pop rdi
@@ -62,8 +63,7 @@
     pop         rbp
     ret
 
-
-;unsigned int vp8_get8x8var_mmx
+;void vpx_get8x8var_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  source_stride,
@@ -72,8 +72,8 @@
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp8_get8x8var_mmx) PRIVATE
-sym(vp8_get8x8var_mmx):
+global sym(vpx_get8x8var_mmx) PRIVATE
+sym(vpx_get8x8var_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -83,7 +83,6 @@
     sub         rsp, 16
     ; end prolog
 
-
         pxor        mm5, mm5                    ; Blank mmx6
         pxor        mm6, mm6                    ; Blank mmx7
         pxor        mm7, mm7                    ; Blank mmx7
@@ -117,7 +116,6 @@
         paddd       mm7, mm0                    ; accumulate in mm7
         paddd       mm7, mm2                    ; accumulate in mm7
 
-
         ; Row 2
         movq        mm0, [rax]                  ; Copy eight bytes to mm0
         movq        mm2, mm0                    ; Take copies
@@ -298,7 +296,6 @@
         mov         dword ptr [rdi], edx
         xor         rax, rax    ; return 0
 
-
     ; begin epilog
     add rsp, 16
     pop rbx
@@ -308,10 +305,8 @@
     pop         rbp
     ret
 
-
-
-;unsigned int
-;vp8_get4x4var_mmx
+;void
+;vpx_get4x4var_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  source_stride,
@@ -320,8 +315,8 @@
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
-global sym(vp8_get4x4var_mmx) PRIVATE
-sym(vp8_get4x4var_mmx):
+global sym(vpx_get4x4var_mmx) PRIVATE
+sym(vpx_get4x4var_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -331,7 +326,6 @@
     sub         rsp, 16
     ; end prolog
 
-
         pxor        mm5, mm5                    ; Blank mmx6
         pxor        mm6, mm6                    ; Blank mmx7
         pxor        mm7, mm7                    ; Blank mmx7
@@ -354,7 +348,6 @@
         movd        mm1, [rbx]                  ; Copy four bytes to mm1
         paddd       mm7, mm0                    ; accumulate in mm7
 
-
         ; Row 2
         movd        mm0, [rax]                  ; Copy four bytes to mm0
         punpcklbw   mm0, mm6                    ; unpack to higher precision
@@ -393,7 +386,6 @@
         pmaddwd     mm0, mm0                    ; square and accumulate
         paddd       mm7, mm0                    ; accumulate in mm7
 
-
         ; Now accumulate the final results.
         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
@@ -413,7 +405,6 @@
         mov         dword ptr [rdi], edx
         xor         rax, rax    ; return 0
 
-
     ; begin epilog
     add rsp, 16
     pop rbx
@@ -423,95 +414,7 @@
     pop         rbp
     ret
 
-
-
-;unsigned int
-;vp8_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp8_get4x4sse_cs_mmx) PRIVATE
-sym(vp8_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%define mmx_filter_shift            7
-
-;void vp8_filter_block2d_bil4x4_var_mmx
+;void vpx_filter_block2d_bil4x4_var_mmx
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -522,8 +425,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil4x4_var_mmx):
+global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil4x4_var_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 8
@@ -533,7 +436,6 @@
     sub         rsp, 16
     ; end prolog
 
-
         pxor            mm6,            mm6                 ;
         pxor            mm7,            mm7                 ;
 
@@ -591,7 +493,6 @@
         pmullw          mm1,            [rdx+8]             ;
         paddw           mm1,            mm3                 ;
 
-
         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
         psraw           mm1,            mmx_filter_shift    ;
 
@@ -616,7 +517,6 @@
         sub             rcx,            1                   ;
         jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
 
-
         pxor            mm3,            mm3                 ;
         pxor            mm2,            mm2                 ;
 
@@ -641,8 +541,6 @@
         movd            dword ptr [rdi],          mm2                 ;
         movd            dword ptr [rsi],          mm4                 ;
 
-
-
     ; begin epilog
     add rsp, 16
     pop rdi
@@ -652,10 +550,7 @@
     pop         rbp
     ret
 
-
-
-
-;void vp8_filter_block2d_bil_var_mmx
+;void vpx_filter_block2d_bil_var_mmx
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -667,8 +562,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil_var_mmx):
+global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil_var_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -809,7 +704,6 @@
         sub             rcx,            1                   ;
         jnz             .filter_block2d_bil_var_mmx_loop       ;
 
-
         pxor            mm3,            mm3                 ;
         pxor            mm2,            mm2                 ;
 
@@ -843,7 +737,6 @@
     pop         rbp
     ret
 
-
 SECTION_RODATA
 ;short mmx_bi_rd[4] = { 64, 64, 64, 64};
 align 16
diff --git a/libvpx/vpx_dsp/x86/variance_mmx.c b/libvpx/vpx_dsp/x86/variance_mmx.c
new file mode 100644
index 0000000..f04f4e2
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/variance_mmx.c
@@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
+  { 128, 128, 128, 128,   0,   0,   0,   0 },
+  { 112, 112, 112, 112,  16,  16,  16,  16 },
+  {  96,  96,  96,  96,  32,  32,  32,  32 },
+  {  80,  80,  80,  80,  48,  48,  48,  48 },
+  {  64,  64,  64,  64,  64,  64,  64,  64 },
+  {  48,  48,  48,  48,  80,  80,  80,  80 },
+  {  32,  32,  32,  32,  96,  96,  96,  96 },
+  {  16,  16,  16,  16, 112, 112, 112, 112 }
+};
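+
+// Each row above holds the two bilinear taps for one 1/8-pel offset, each tap
+// replicated four times; the taps sum to 128, matching the filter shift of 7
+// used in variance_impl_mmx.asm.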
+
+extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              unsigned int *sse, int *sum);
+
+extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
+                                              int ref_pixels_per_line,
+                                              const unsigned char *src_ptr,
+                                              int src_pixels_per_line,
+                                              const int16_t *HFilter,
+                                              const int16_t *VFilter,
+                                              int *sum,
+                                              unsigned int *sumsquared);
+
+extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
+                                           int ref_pixels_per_line,
+                                           const unsigned char *src_ptr,
+                                           int src_pixels_per_line,
+                                           unsigned int Height,
+                                           const int16_t *HFilter,
+                                           const int16_t *VFilter,
+                                           int *sum,
+                                           unsigned int *sumsquared);
+
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+    unsigned int var;
+    int avg;
+
+    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 4));
+}
+
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+    unsigned int var;
+    int avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
+    *sse = var;
+
+    return (var - (((unsigned int)avg * avg) >> 6));
+}
+
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+                              const unsigned char *b, int b_stride,
+                              unsigned int *sse) {
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse2, &sum2);
+    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    *sse = var;
+    return var;
+}
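+
+// Note: the MSE function above returns the summed SSE of the four 8x8
+// quadrants without the sum^2 / (w * h) mean correction used by the
+// variance functions.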
+
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+                                   const unsigned char *b, int b_stride,
+                                   unsigned int *sse) {
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse2, &sum2);
+    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 8));
+}
+
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+
+    return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
+                                       int xoffset, int yoffset,
+                                       const uint8_t *b, int b_stride,
+                                       uint32_t *sse) {
+    int xsum;
+    unsigned int xxsum;
+    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
+                                      bilinear_filters_mmx[xoffset],
+                                      bilinear_filters_mmx[yoffset],
+                                      &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+}
+
+
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
+                                       int xoffset, int yoffset,
+                                       const uint8_t *b, int b_stride,
+                                       uint32_t *sse) {
+    int xsum;
+    uint32_t xxsum;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((uint32_t)xsum * xsum) >> 6));
+}
+
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
+                                         int xoffset, int yoffset,
+                                         const uint8_t *b, int b_stride,
+                                         uint32_t *sse) {
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum0, &xxsum0);
+
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
+                                        int xoffset, int yoffset,
+                                        const uint8_t *b, int b_stride,
+                                        uint32_t *sse) {
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum0, &xxsum0);
+
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
+                                        int xoffset, int yoffset,
+                                        const uint8_t *b, int b_stride,
+                                        uint32_t *sse) {
+    int xsum;
+    unsigned int xxsum;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((uint32_t)xsum * xsum) >> 7));
+}
+
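+// The half-pel variants below call the sub-pel path with offset 4, which
+// selects the {64, 64, ...} filter row, i.e. a simple average of adjacent
+// pixels.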
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
+}
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000..e6c9365
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,477 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+
+typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
+                                const unsigned char *ref, int ref_stride,
+                                unsigned int *sse, int *sum);
+
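+// Sum of squares over 256 int16_t samples (32 loads of 8 values), i.e. the
+// 16x16 macroblock sum-of-squares.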
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return  _mm_cvtsi128_si32(vsum);
+}
+
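+// READ64 loads two 4-byte rows (rows i and i + 1) and interleaves their bytes
+// into the low 64 bits of an XMM register.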
+#define READ64(p, stride, i) \
+  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+  // sum
+  __m128i vsum = _mm_add_epi16(diff0, diff1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+                       _mm_madd_epi16(diff1, diff1));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  *sse = _mm_cvtsi128_si32(vsum);
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
+                        const uint8_t *ref, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; i += 2) {
+    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + i * src_stride)), zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + i * ref_stride)), zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + (i + 1) * src_stride)), zero);
+    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    const __m128i s = _mm_loadu_si128((const __m128i *)src);
+    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+             (int16_t)_mm_extract_epi16(vsum, 1);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
+
+
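+// Tile the w x h area into block_size x block_size sub-blocks, accumulating
+// SSE and sum with the supplied kernel; callers apply the sum^2 / (w * h)
+// mean correction afterwards.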
+static void variance_sse2(const unsigned char *src, int src_stride,
+                          const unsigned char *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          getNxMvar_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 4);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 6);
+}
+
+unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
+                                    const unsigned char *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse) {
+  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+#if CONFIG_USE_X86INC
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+  int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+                                          ptrdiff_t src_stride, \
+                                          int x_offset, int y_offset, \
+                                          const uint8_t *dst, \
+                                          ptrdiff_t dst_stride, \
+                                          int height, unsigned int *sse, \
+                                          void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+  DECL(4, opt2); \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                     int src_stride, \
+                                                     int x_offset, \
+                                                     int y_offset, \
+                                                     const uint8_t *dst, \
+                                                     int dst_stride, \
+                                                     unsigned int *sse_ptr) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                y_offset, dst, dst_stride, \
+                                                h, &sse, NULL, NULL); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                   x_offset, y_offset, \
+                                                   dst + 16, dst_stride, \
+                                                   h, &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 32, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 48, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
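+
+// FN(w, h, wf, wlog2, hlog2, ...) builds a w x h sub-pixel variance from one,
+// two or four wf-wide column passes. wlog2 + hlog2 equals log2(w * h), so the
+// final shift is the usual mean correction; the int64_t cast is used where
+// sum * sum may exceed 32 bits.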
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+                                            ptrdiff_t src_stride, \
+                                            int x_offset, int y_offset, \
+                                            const uint8_t *dst, \
+                                            ptrdiff_t dst_stride, \
+                                            const uint8_t *sec, \
+                                            ptrdiff_t sec_stride, \
+                                            int height, unsigned int *sse, \
+                                            void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                         int src_stride, \
+                                                         int x_offset, \
+                                                         int y_offset, \
+                                                         const uint8_t *dst, \
+                                                         int dst_stride, \
+                                                         unsigned int *sseptr, \
+                                                         const uint8_t *sec) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                    y_offset, dst, dst_stride, \
+                                                    sec, w, h, &sse, NULL, \
+                                                    NULL); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst + 16, dst_stride, \
+                                                       sec + 16, w, h, &sse2, \
+                                                       NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 32, dst_stride, \
+                                                     sec + 32, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 48, dst_stride, \
+                                                     sec + 48, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sseptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
new file mode 100644
index 0000000..422b0fc
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
@@ -0,0 +1,162 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const int16_t *filter_x, int x_step_q4,
+//                                  const int16_t *filter_y, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
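+// The FUN_CONV_1D invocations expand into the vpx_convolve8_*_sse2 wrappers
+// documented in the comments above; each wrapper selects an 8-tap or 2-tap
+// (bilinear) vpx_filter_block1d* kernel from the filter coefficients and
+// loops over 16/8/4 pixel wide columns.  The vertical variants are passed
+// src - src_stride * 3 so that reads start 3 rows above the output row, as
+// the 8-tap filter requires.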
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const int16_t *filter_x, int x_step_q4,
+//                         const int16_t *filter_y, int y_step_q4,
+//                         int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const int16_t *filter_x, int x_step_q4,
+//                             const int16_t *filter_y, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
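+// FUN_CONV_2D builds the full 2-D convolution from the two 1-D passes: a
+// horizontal pass into an intermediate buffer followed by a vertical pass
+// into the destination; the avg_ variant averages only in the final
+// vertical pass.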
+
+#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+//                                          ptrdiff_t src_stride,
+//                                          uint8_t *dst,
+//                                          ptrdiff_t dst_stride,
+//                                          const int16_t *filter_x,
+//                                          int x_step_q4,
+//                                          const int16_t *filter_y,
+//                                          int y_step_q4,
+//                                          int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+//                                         ptrdiff_t src_stride,
+//                                         uint8_t *dst,
+//                                         ptrdiff_t dst_stride,
+//                                         const int16_t *filter_x,
+//                                         int x_step_q4,
+//                                         const int16_t *filter_y,
+//                                         int y_step_q4,
+//                                         int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+                 sse2);
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2);
+HIGH_FUN_CONV_2D(avg_ , sse2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+#endif  // HAVE_SSE2
diff --git a/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
similarity index 100%
rename from libvpx/vp9/common/x86/vp9_copy_sse2.asm
rename to libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000..bfc816f
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -0,0 +1,962 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after the other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm6
+    punpcklwd   xmm2, xmm5
+    punpcklwd   xmm3, xmm4
+    punpcklwd   xmm1, xmm7
+
+    movdqa      k0k6, xmm0
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+    movdqa      k1k7, xmm1
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1
+    psubw       xmm0, xmm2
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
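+    ;i.e. max = (1 << bps) - 1 and min = 0, the legal pixel range for the
+    ;configured bit depth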
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+    punpcklwd   xmm0, xmm6                  ;two rows in one register
+    punpcklwd   xmm1, xmm7
+    punpcklwd   xmm2, xmm5
+    punpcklwd   xmm3, xmm4
+
+    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
+    pmaddwd     xmm1, k1k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm3, k3k4
+
+    paddd       xmm0, xmm1                  ;sum
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+
+    paddd       xmm0, krd                   ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+    punpcklwd   xmm0, xmm1
+    punpckhwd   xmm6, xmm7
+    punpckhwd   xmm2, xmm5
+    punpckhwd   xmm3, xmm4
+
+    movdqa      k0k1, xmm0                  ;store filter factors on stack
+    movdqa      k6k7, xmm6
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6                   ;rounding
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1
+    psubw       xmm0, xmm2
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
+%endm
+
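+;Load the 8 source rows of the vertical filter window, starting at horizontal
+;byte offset %1, into xmm0-xmm7 (the trailing comments give the row index).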
+%macro LOAD_VERT_8 1
+    movdqu      xmm0, [rsi + %1]            ;0
+    movdqu      xmm1, [rsi + rax + %1]      ;1
+    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]
+    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
+    movdqu      xmm2, [rsi + rax + %1]      ;2
+    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
+    movdqu      xmm4, [rsi + rdx + %1]      ;4
+    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+    movdqu      temp, xmm4
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm1, xmm6
+    punpcklwd   xmm6, xmm7
+    punpckhwd   xmm1, xmm7
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm2, xmm5
+    punpckhwd   xmm7, xmm5
+
+    movdqu      xmm5, temp
+    movdqu      temp, xmm4
+    movdqa      xmm4, xmm3
+    punpcklwd   xmm3, xmm5
+    punpckhwd   xmm4, xmm5
+    movdqu      xmm5, temp
+
+    pmaddwd     xmm0, k0k1
+    pmaddwd     xmm5, k0k1
+    pmaddwd     xmm6, k6k7
+    pmaddwd     xmm1, k6k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm7, k2k5
+    pmaddwd     xmm3, k3k4
+    pmaddwd     xmm4, k3k4
+
+    paddd       xmm0, xmm6
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+    paddd       xmm5, xmm1
+    paddd       xmm5, xmm7
+    paddd       xmm5, xmm4
+
+    paddd       xmm0, krd                   ;rounding
+    paddd       xmm5, krd
+    psrad       xmm0, 7                     ;shift
+    psrad       xmm5, 7
+    packssdw    xmm0, xmm5                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movdqu      xmm1, [rdi + %2]
+    pavgw       xmm0, xmm1
+%endif
+    movdqu      [rdi + %2], xmm0
+%endm
+
+;void vpx_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 0, 16
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 1, 16
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d4_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm4,   [rsi + 2]
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2
+    psrldq      xmm6, 4
+    psrldq      xmm7, 6
+    psrldq      xmm2, 4
+    psrldq      xmm3, 6
+    psrldq      xmm5, 2
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d8_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d16_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 0, 16
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm4,   [rsi + 2]
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2
+    psrldq      xmm6, 4
+    psrldq      xmm7, 6
+    psrldq      xmm2, 4
+    psrldq      xmm3, 6
+    psrldq      xmm5, 2
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 1, 16
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000..72f2ff7
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,494 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    pshuflw     xmm4, xmm3, 11111111b       ;k3
+    psrldq      xmm3, 8
+    pshuflw     xmm3, xmm3, 0b              ;k4
+    punpcklwd   xmm4, xmm3                  ;k3k4
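+    ;only taps 3 and 4 of the 8-tap filter are non-zero for the bilinear
+    ;filters handled by these 2-tap routines, so only k3k4 is kept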
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0
+
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm5, rdx
+    movq        xmm2, rcx
+    pshufd      xmm5, xmm5, 0b
+    movdqa      xmm1, xmm5
+    psllw       xmm5, xmm2
+    psubw       xmm5, xmm1                  ;max value (for clamping)
+    pxor        xmm2, xmm2                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+    punpcklwd   xmm0, xmm1                  ;two rows in one register
+    pmaddwd     xmm0, xmm4                  ;multiply the filter factors
+
+    paddd       xmm0, xmm3                  ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm5
+    pmaxsw      xmm0, xmm2
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+
+    movq        [rdi], xmm0
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040
+
+    movdqa      xmm6, [rdx]                 ;load filters
+
+    pshuflw     xmm7, xmm6, 11111111b       ;k3
+    pshufhw     xmm6, xmm6, 0b              ;k4
+    psrldq      xmm6, 8
+    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0
+
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm8, rdx
+    movq        xmm5, rcx
+    pshufd      xmm8, xmm8, 0b
+    movdqa      xmm1, xmm8
+    psllw       xmm8, xmm5
+    psubw       xmm8, xmm1                  ;max value (for clamping)
+    pxor        xmm5, xmm5                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+    movdqa      xmm6, xmm0
+    punpckhwd   xmm6, xmm1
+    punpcklwd   xmm0, xmm1
+    pmaddwd     xmm6, xmm7
+    pmaddwd     xmm0, xmm7
+
+    paddd       xmm6, xmm4                  ;rounding
+    paddd       xmm0, xmm4                  ;rounding
+    psrad       xmm6, 7                     ;shift
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm6                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgw       xmm0, xmm1
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+    movdqa      xmm9, xmm0
+    movdqa      xmm6, xmm2
+    punpckhwd   xmm9, xmm1
+    punpckhwd   xmm6, xmm3
+    punpcklwd   xmm0, xmm1
+    punpcklwd   xmm2, xmm3
+
+    pmaddwd     xmm9, xmm7
+    pmaddwd     xmm6, xmm7
+    pmaddwd     xmm0, xmm7
+    pmaddwd     xmm2, xmm7
+
+    paddd       xmm9, xmm4                  ;rounding
+    paddd       xmm6, xmm4
+    paddd       xmm0, xmm4
+    paddd       xmm2, xmm4
+
+    psrad       xmm9, 7                     ;shift
+    psrad       xmm6, 7
+    psrad       xmm0, 7
+    psrad       xmm2, 7
+
+    packssdw    xmm0, xmm9                  ;pack back to word
+    packssdw    xmm2, xmm6                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+    pminsw      xmm2, xmm8
+    pmaxsw      xmm2, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]
+    movdqu      xmm3, [rdi + 16]
+    pavgw       xmm0, xmm1
+    pavgw       xmm2, xmm3
+%endif
+    movdqu      [rdi], xmm0               ;store the result
+    movdqu      [rdi + 16], xmm2          ;store the result
+
+    lea         rsi, [rsi + 2*rax]
+    lea         rdi, [rdi + 2*rdx]
+    dec         rcx
+%endm
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm2, [rsi + 16]
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm3, [rsi + 2*rax + 16]
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm2, [rsi + 16]
+    movdqu        xmm3, [rsi + 2*rax + 16]
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]
+    movdqu      xmm2,   [rsi + 16]
+    movdqu      xmm3,   [rsi + 18]
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]
+    movdqu      xmm2,   [rsi + 16]
+    movdqu      xmm3,   [rsi + 18]
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
similarity index 78%
rename from libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
rename to libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index d109e13..29ede19 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -8,7 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// immintrin.h.
+
 #include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
 #include "vpx_ports/mem.h"
 
 // filters for 16_h8 and 16_v8
@@ -53,23 +60,23 @@
 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
 #endif  // __clang__
 
-void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
-                                  unsigned int src_pixels_per_line,
-                                  unsigned char *output_ptr,
-                                  unsigned int  output_pitch,
-                                  unsigned int  output_height,
-                                  int16_t *filter) {
+static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i filtersReg;
   __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
   __m256i srcReg32b1, srcReg32b2, filtersReg32;
   unsigned int i;
-  unsigned int src_stride, dst_stride;
+  ptrdiff_t src_stride, dst_stride;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to 8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -104,9 +111,9 @@
   for (i = output_height; i > 1; i-=2) {
     // load the 2 strides of source
     srcReg32b1 = _mm256_castsi128_si256(
-                 _mm_loadu_si128((__m128i *)(src_ptr-3)));
+                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
     srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
-                 _mm_loadu_si128((__m128i *)
+                 _mm_loadu_si128((const __m128i *)
                  (src_ptr+src_pixels_per_line-3)), 1);
 
     // filter the source buffer
@@ -135,9 +142,9 @@
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
     srcReg32b2 = _mm256_castsi128_si256(
-                 _mm_loadu_si128((__m128i *)(src_ptr+5)));
+                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
     srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
-                 _mm_loadu_si128((__m128i *)
+                 _mm_loadu_si128((const __m128i *)
                  (src_ptr+src_pixels_per_line+5)), 1);
 
     // add and saturate the results together
@@ -202,7 +209,7 @@
     __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
     __m128i srcRegFilt2, srcRegFilt3;
 
-    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
@@ -237,7 +244,7 @@
 
     // reading the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
 
     // add and saturate the results together
     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -297,24 +304,24 @@
   }
 }
 
-void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
-                                  unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  int16_t *filter) {
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i filtersReg;
   __m256i addFilterReg64;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   unsigned int i;
-  unsigned int src_stride, dst_stride;
+  ptrdiff_t src_stride, dst_stride;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the
   // same data in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -344,19 +351,19 @@
 
   // load 16 bytes 7 times in stride of src_pitch
   srcReg32b1 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr)));
+               _mm_loadu_si128((const __m128i *)(src_ptr)));
   srcReg32b2 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
   srcReg32b3 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
   srcReg32b4 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
   srcReg32b5 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
   srcReg32b6 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
   srcReg32b7 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
 
   // have each consecutive loads on the same 256 register
   srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
@@ -393,11 +400,11 @@
      // load the last 2 loads of 16 bytes and have every two
      // consecutive loads in the same 256 bit register
      srcReg32b8 = _mm256_castsi128_si256(
-     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
      srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
      _mm256_castsi256_si128(srcReg32b8), 1);
      srcReg32b9 = _mm256_castsi128_si256(
-     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
      srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
      _mm256_castsi256_si128(srcReg32b9), 1);
 
@@ -409,35 +416,35 @@
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
      srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
-
 
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-
-     // multiply 2 adjacent elements with the filter and add the result
      srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-     srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_min_epi16(srcReg32b6, srcReg32b13));
-
-     // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_max_epi16(srcReg32b6, srcReg32b13));
 
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+     // add and saturate the results together
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
 
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
      srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
@@ -476,7 +483,7 @@
     __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
     __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
     // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
 
     // merge the last 2 results together
     srcRegFilt4 = _mm_unpacklo_epi8(
@@ -542,3 +549,54 @@
     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
   }
 }
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
+#endif  // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2  vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2  vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2  vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2  vpx_filter_block1d4_h2_ssse3
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+FUN_CONV_2D(, avx2);
+#endif  // HAVE_AVX2 && HAVE_SSSE3
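
Editor's note: the FUN_CONV_1D and FUN_CONV_2D invocations above come from vpx_dsp/x86/convolve.h and stamp out the vpx_convolve8_horiz_avx2, vpx_convolve8_vert_avx2 and vpx_convolve8_avx2 entry points whose prototypes are quoted in the comments, routing each call to the vpx_filter_block1d{4,8,16}_{h,v}{2,8}_avx2 block filters declared or #defined earlier in the file. The sketch below is illustrative only: it is not the macro's literal expansion and not part of this change, it assumes the surrounding file's headers, and it ignores the non-16 step and bilinear special cases the real macro also handles.

  /* Illustrative dispatch sketch, NOT the FUN_CONV_1D expansion.
   * A width-w, height-h horizontal 8-tap pass is built from the
   * fixed-width block filters; each block filter reads src_ptr - 3
   * itself, so src is passed unshifted for the horizontal case. */
  static void convolve8_horiz_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int w, int h) {
    int x = 0;
    while (w - x >= 16) {  /* widest blocks first */
      vpx_filter_block1d16_h8_avx2(src + x, src_stride, dst + x, dst_stride,
                                   h, filter_x);
      x += 16;
    }
    while (w - x >= 8) {
      vpx_filter_block1d8_h8_avx2(src + x, src_stride, dst + x, dst_stride,
                                  h, filter_x);
      x += 8;
    }
    while (w - x >= 4) {
      vpx_filter_block1d4_h8_avx2(src + x, src_stride, dst + x, dst_stride,
                                  h, filter_x);
      x += 4;
    }
  }

The vertical wrappers follow the same pattern but, as the FUN_CONV_1D arguments show, start from src - src_stride * 3 so the 8-tap window is centred on the output row.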
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..772e01e
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,1161 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// tmmintrin.h.
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
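
Editor's note: these tables are byte-gather masks for _mm_shuffle_epi8. filt1_global, for example, rearranges a 16-byte source window into the pairs (s[0],s[1]), (s[1],s[2]), ..., (s[7],s[8]) so that a single _mm_maddubs_epi16 against a register holding the repeated taps (k0,k1) yields, per 16-bit lane, the tap-pair partial sum for one output pixel. A scalar view of that lane value (illustrative sketch only; the SIMD result additionally saturates to 16 bits):

  /* Lane x of shuffle(filt1_global) followed by maddubs with (k0,k1),
   * written out in scalar form.  s is the 16-byte window loaded from
   * src_ptr - 3 in the functions below. */
  static int16_t pair01_partial(const uint8_t *s, int x, int k0, int k1) {
    return (int16_t)(s[x] * k0 + s[x + 1] * k1);
  }

filt2_global, filt3_global and filt4_global do the same for tap pairs (k2,k3), (k4,k5) and (k6,k7) at byte offsets 2, 4 and 6, while the filt*_4_h8 variants pack the inputs for two tap pairs of the same four pixels into one register for the 4-wide path.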
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+
+void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bit in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the forth 16 bits in the filter into the second lane
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // loading the local filters
+  shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
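
Editor's note: each iteration above produces the same four output bytes as a plain 8-tap convolution with libvpx's usual rounding. The maddubs results are tap-pair partial sums, the min/max additions sum the four partial results in an order chosen to behave well with saturating adds, and adding 64 then shifting right by 7 (FILTER_BITS) is the rounding before the packus clamp. A scalar sketch of one output pixel (illustrative only, not part of the change):

  /* Scalar equivalent of one 8-tap output pixel; the 8 taps cover
   * src[-3] .. src[+4] relative to the output position. */
  static uint8_t convolve8_px(const uint8_t *src, const int16_t *filter) {
    int k, sum = 0;
    for (k = 0; k < 8; ++k)
      sum += src[k - 3] * filter[k];
    sum = (sum + 64) >> 7;           /* round and shift by FILTER_BITS */
    if (sum < 0) sum = 0;
    if (sum > 255) sum = 255;        /* packus-style clamp to 8 bits */
    return (uint8_t)sum;
  }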
+
+void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pixels_per_line,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t output_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes.
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the result together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // merge the result together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only 8 bytes convolve result
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pitch,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t out_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 16 bytes
+  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 16 bytes
+    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the result together
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+    // merge the result together
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+    // merge the result together
+    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+#if ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
+#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#endif  // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,           \
+                      out0, out1, out2, out3, out4, out5, out6, out7) { \
+  const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
+  const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
+  const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
+  const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
+                                                                        \
+  const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
+  const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
+  const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
+  const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
+                                                                        \
+  const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
+  const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
+  const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
+  const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
+                                                                        \
+  out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
+  out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
+  out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
+  out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
+  out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
+  out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
+  out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
+  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *x_filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
+  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
+  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
+  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
+  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
+  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
+  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
+  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
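
Editor's note: filter_horiz_w8_ssse3 rounds with _mm_mulhrs_epi16(temp, k_256) rather than the add-64/srai-7 sequence used by the block filters above. The two are equivalent here: mulhrs computes (x * 256 + 2^14) >> 15, which is exactly (x + 64) >> 7. A small illustrative check (assumes arithmetic right shift of negatives, as the SIMD instructions use):

  #include <assert.h>
  /* _mm_mulhrs_epi16(x, 256) == (x + 64) >> 7 for every 16-bit x. */
  static void check_mulhrs_rounding(void) {
    int x;
    for (x = -32768; x <= 32767; ++x)
      assert(((x * 256 + (1 << 14)) >> 15) == ((x + 64) >> 7));
  }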
+
+static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i A, B, C, D, E, F, G, H;
+
+  A = _mm_loadl_epi64((const __m128i *)src);
+  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
+  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
+  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
+  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
+
+  TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
+                A, B, C, D, E, F, G, H);
+
+  _mm_storel_epi64((__m128i*)dst, A);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas.  The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = h + (8 - (h & 0x7));
+
+  do {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 8) {
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+    }
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
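
Editor's note: in the fallback branch above, a zero sub-pixel phase means the kernel is the identity (centre tap 128, all other taps 0), so filtering degenerates to copying the centre sample. Because src was moved back by SUBPEL_TAPS / 2 - 1 = 3 pixels before the loop, that sample is src_x[3] in each row. The copy is equivalent to this illustrative helper (not in the file):

  /* Copy the unfiltered centre column for a zero-phase position. */
  static void copy_centre_column(const uint8_t *src_x, ptrdiff_t stride,
                                 uint8_t *out, int rows) {
    int i;
    for (i = 0; i < rows; ++i)
      out[i] = src_x[i * stride + 3];  /* +3 undoes the earlier src -= 3 */
  }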
+
+static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  // TRANSPOSE...
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  //
+  // TO
+  //
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+  // 04 14 24 34
+  // 05 15 25 35
+  // 06 16 26 36
+  // 07 17 27 37
+  //
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i s1s0  = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 02 03 12 13 22 23 32 33
+  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+  // 06 07 16 17 26 27 36 37
+  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
+  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
+  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
+  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
+  // 00 10 01 11 02 12 03 13
+  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
+  // 20 30 21 31 22 32 23 33
+  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  B = _mm_srli_si128(A, 4);
+  C = _mm_srli_si128(A, 8);
+  D = _mm_srli_si128(A, 12);
+
+  *(int *)(dst) =  _mm_cvtsi128_si32(A);
+  *(int *)(dst + dst_stride) =  _mm_cvtsi128_si32(B);
+  *(int *)(dst + dst_stride * 2) =  _mm_cvtsi128_si32(C);
+  *(int *)(dst + dst_stride * 3) =  _mm_cvtsi128_si32(D);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; y += 4) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 4) {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+    }
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
+  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
+  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink to 8 bit each 16 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes convolve result
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter, int w) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  int i;
+
+  for (i = 0; i < w; i += 16) {
+    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
+    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+    const __m128i C =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+    const __m128i D =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+    const __m128i E =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+    const __m128i F =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+    const __m128i G =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+    const __m128i H =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+    // merge the result together
+    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
+    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
+    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
+    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
+    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
+    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
+    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
+    // add and saturate the results together
+    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
+    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
+    // merge the result together
+    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
+    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
+    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
+    // merge the result together
+    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
+    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
+    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
+    // add and saturate the results together
+    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
+    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
+
+    // add and saturate the results together
+    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
+    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
+    // round and shift by 7 bit each 16 bit
+    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
+    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src_ptr += 16;
+     // save 16 bytes convolve result
+    _mm_store_si128((__m128i*)&dst[i], temp_hi);
+  }
+}
+
+static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *y_filters,
+                                    int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+                            w);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *const x_filters,
+                             int x0_q4, int x_step_q4,
+                             const InterpKernel *const y_filters,
+                             int y0_q4, int y_step_q4,
+                             int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
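+  // Both passes are offset by (SUBPEL_TAPS / 2 - 1) rows so the 8-tap vertical
+  // filter has its leading tap rows available in the intermediate buffer.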
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            w, intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            w, intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
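+  // A table of 16 sub-pixel kernels of 8 int16_t taps is 16 * 8 * 2 = 256
+  // bytes, so masking off the low 8 address bits recovers kernel 0.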
+  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
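+  // The pointer difference in whole InterpKernel units is the sub-pixel phase
+  // index of this filter within its table.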
+  return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  scaledconvolve2d(src, src_stride, dst, dst_stride,
+                   filters_x, x0_q4, x_step_q4,
+                   filters_y, y0_q4, y_step_q4, w, h);
+}
+
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_, ssse3);
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
similarity index 94%
rename from libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
rename to libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
index 9dc8d0a..08f3d6a 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -176,7 +176,7 @@
     movq        [rdi + %2], xmm0
 %endm
 
-;void vp9_filter_block1d4_v8_sse2
+;void vpx_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -185,8 +185,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_sse2):
+global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -243,7 +243,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d8_v8_sse2
+;void vpx_filter_block1d8_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -252,8 +252,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_sse2):
+global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -302,7 +302,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d16_v8_sse2
+;void vpx_filter_block1d16_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -311,8 +311,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_sse2):
+global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -365,8 +365,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_sse2):
+global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -423,8 +423,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_sse2):
+global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -472,8 +472,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_sse2):
+global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -525,7 +525,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d4_h8_sse2
+;void vpx_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -534,8 +534,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_sse2):
+global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -599,7 +599,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d8_h8_sse2
+;void vpx_filter_block1d8_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -608,8 +608,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_sse2):
+global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -674,7 +674,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d16_h8_sse2
+;void vpx_filter_block1d16_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -683,8 +683,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_sse2):
+global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -769,8 +769,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_sse2):
+global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -834,8 +834,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_sse2):
+global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -900,8 +900,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_sse2):
+global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
similarity index 92%
rename from libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
rename to libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index fd781d4..68acc03 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -18,7 +18,7 @@
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
@@ -310,7 +310,7 @@
     jnz         .loop
 %endm
 
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d4_v8_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -319,8 +319,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_ssse3):
+global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -351,7 +351,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d8_v8_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -360,8 +360,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
+global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -392,7 +392,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d16_v8_ssse3
+;void vpx_filter_block1d16_v8_ssse3
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -401,8 +401,8 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
+global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -436,8 +436,8 @@
 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 
-global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_ssse3):
+global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -468,8 +468,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_ssse3):
+global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -500,8 +500,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_ssse3):
+global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -661,7 +661,7 @@
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
@@ -765,40 +765,50 @@
 
     movq        xmm0,   [rsi - 3]           ;load src data
     movq        xmm4,   [rsi + 5]
-    movq        xmm7,   [rsi + 13]
+    movq        xmm6,   [rsi + 13]
     punpcklqdq  xmm0,   xmm4
-    punpcklqdq  xmm4,   xmm7
+    punpcklqdq  xmm4,   xmm6
 
+    movdqa      xmm7,   xmm0
+
+    punpcklbw   xmm7,   xmm7
+    punpckhbw   xmm0,   xmm0
     movdqa      xmm1,   xmm0
     movdqa      xmm2,   xmm0
     movdqa      xmm3,   xmm0
+
+    palignr     xmm0,   xmm7, 1
+    palignr     xmm1,   xmm7, 5
+    pmaddubsw   xmm0,   k0k1
+    palignr     xmm2,   xmm7, 9
+    pmaddubsw   xmm1,   k2k3
+    palignr     xmm3,   xmm7, 13
+
+    pmaddubsw   xmm2,   k4k5
+    pmaddubsw   xmm3,   k6k7
+    paddsw      xmm0,   xmm3
+
+    movdqa      xmm3,   xmm4
+    punpcklbw   xmm3,   xmm3
+    punpckhbw   xmm4,   xmm4
+
     movdqa      xmm5,   xmm4
     movdqa      xmm6,   xmm4
     movdqa      xmm7,   xmm4
 
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
-    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
+    palignr     xmm4,   xmm3, 1
+    palignr     xmm5,   xmm3, 5
+    palignr     xmm6,   xmm3, 9
+    palignr     xmm7,   xmm3, 13
 
-    pmaddubsw   xmm0,   k0k1
-    pmaddubsw   xmm1,   k2k3
-    pmaddubsw   xmm2,   k4k5
-    pmaddubsw   xmm3,   k6k7
-    pmaddubsw   xmm4,   k0k1
-    pmaddubsw   xmm5,   k2k3
-    pmaddubsw   xmm6,   k4k5
-    pmaddubsw   xmm7,   k6k7
-
-    paddsw      xmm0,   xmm3
     movdqa      xmm3,   xmm1
+    pmaddubsw   xmm4,   k0k1
     pmaxsw      xmm1,   xmm2
+    pmaddubsw   xmm5,   k2k3
     pminsw      xmm2,   xmm3
+    pmaddubsw   xmm6,   k4k5
     paddsw      xmm0,   xmm2
+    pmaddubsw   xmm7,   k6k7
     paddsw      xmm0,   xmm1
 
     paddsw      xmm4,   xmm7
@@ -828,7 +838,7 @@
     jnz         .loop
 %endm
 
-;void vp9_filter_block1d4_h8_ssse3
+;void vpx_filter_block1d4_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -837,8 +847,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_ssse3):
+global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -867,7 +877,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d8_h8_ssse3
+;void vpx_filter_block1d8_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -876,8 +886,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -909,7 +919,7 @@
     pop         rbp
     ret
 
-;void vp9_filter_block1d16_h8_ssse3
+;void vpx_filter_block1d16_h8_ssse3
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -918,8 +928,8 @@
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
+global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -951,8 +961,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_ssse3):
+global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -981,8 +991,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_ssse3):
+global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -1014,8 +1024,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_ssse3):
+global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
similarity index 89%
rename from libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
rename to libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
index d94ccf2..a378dd0 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -131,8 +131,8 @@
     dec         rcx
 %endm
 
-global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_sse2):
+global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -155,8 +155,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_sse2):
+global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -181,8 +181,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_sse2):
+global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -209,8 +209,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_sse2):
+global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -233,8 +233,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_sse2):
+global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -259,8 +259,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_sse2):
+global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -287,8 +287,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_sse2):
+global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -312,8 +312,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_sse2):
+global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -339,8 +339,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_sse2):
+global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -367,8 +367,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_sse2):
+global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -392,8 +392,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_sse2):
+global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -419,8 +419,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_sse2):
+global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
similarity index 88%
rename from libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
rename to libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index b5e18fe..3c8cfd2 100644
--- a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -109,8 +109,8 @@
     dec         rcx
 %endm
 
-global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_ssse3):
+global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -133,8 +133,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_ssse3):
+global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -159,8 +159,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_ssse3):
+global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -186,8 +186,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_ssse3):
+global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -210,8 +210,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_ssse3):
+global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -236,8 +236,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_ssse3):
+global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -263,8 +263,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_ssse3):
+global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -288,8 +288,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_ssse3):
+global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -315,8 +315,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_ssse3):
+global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -342,8 +342,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_ssse3):
+global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -367,8 +367,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_ssse3):
+global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -394,8 +394,8 @@
     pop         rbp
     ret
 
-global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_ssse3):
+global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 225a3ba..c4dd785 100644
--- a/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -13,35 +13,6 @@
 #define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
 #include "./vpx_config.h"
 
-#ifndef CONFIG_MEM_MANAGER
-# if defined(VXWORKS)
-#  define CONFIG_MEM_MANAGER  1 /*include heap manager functionality,*/
-/*default: enabled on vxworks*/
-# else
-#  define CONFIG_MEM_MANAGER  0 /*include heap manager functionality*/
-# endif
-#endif /*CONFIG_MEM_MANAGER*/
-
-#ifndef CONFIG_MEM_TRACKER
-# define CONFIG_MEM_TRACKER     1 /*include xvpx_* calls in the lib*/
-#endif
-
-#ifndef CONFIG_MEM_CHECKS
-# define CONFIG_MEM_CHECKS      0 /*include some basic safety checks in
-vpx_memcpy, _memset, and _memmove*/
-#endif
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS   0  /*use function pointers instead of compiled functions.*/
-#endif
-
-#if CONFIG_MEM_TRACKER
-# include "vpx_mem_tracker.h"
-# if VPX_MEM_TRACKER_VERSION_CHIEF != 2 || VPX_MEM_TRACKER_VERSION_MAJOR != 5
-#  error "vpx_mem requires memory tracker version 2.5 to track memory usage"
-# endif
-#endif
-
 #define ADDRESS_STORAGE_SIZE      sizeof(size_t)
 
 #ifndef DEFAULT_ALIGNMENT
@@ -54,41 +25,6 @@
 # endif
 #endif
 
-#if CONFIG_MEM_TRACKER
-# define TRY_BOUNDS_CHECK         1        /*when set to 1 pads each allocation,
-integrity can be checked using
-vpx_memory_tracker_check_integrity
-or on free by defining*/
-/*TRY_BOUNDS_CHECK_ON_FREE*/
-#else
-# define TRY_BOUNDS_CHECK         0
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if TRY_BOUNDS_CHECK
-# define TRY_BOUNDS_CHECK_ON_FREE 0          /*checks mem integrity on every
-free, very expensive*/
-# define BOUNDS_CHECK_VALUE       0xdeadbeef /*value stored before/after ea.
-mem addr for bounds checking*/
-# define BOUNDS_CHECK_PAD_SIZE    32         /*size of the padding before and
-after ea allocation to be filled
-with BOUNDS_CHECK_VALUE.
-this should be a multiple of 4*/
-#else
-# define BOUNDS_CHECK_VALUE       0
-# define BOUNDS_CHECK_PAD_SIZE    0
-#endif /*TRY_BOUNDS_CHECK*/
-
-#ifndef REMOVE_PRINTFS
-# define REMOVE_PRINTFS 0
-#endif
-
-/* Should probably use a vpx_mem logger function. */
-#if REMOVE_PRINTFS
-# define _P(x)
-#else
-# define _P(x) x
-#endif
-
 /*returns an addr aligned to the byte boundary specified by align*/
 #define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
 
diff --git a/libvpx/vpx_mem/include/vpx_mem_tracker.h b/libvpx/vpx_mem/include/vpx_mem_tracker.h
deleted file mode 100644
index 1335e00..0000000
--- a/libvpx/vpx_mem/include/vpx_mem_tracker.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-#define VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-
-/* vpx_mem_tracker version info */
-#define vpx_mem_tracker_version "2.5.1.1"
-
-#define VPX_MEM_TRACKER_VERSION_CHIEF 2
-#define VPX_MEM_TRACKER_VERSION_MAJOR 5
-#define VPX_MEM_TRACKER_VERSION_MINOR 1
-#define VPX_MEM_TRACKER_VERSION_PATCH 1
-/* END - vpx_mem_tracker version info */
-
-#include <stdarg.h>
-
-struct mem_block {
-  size_t addr;
-  unsigned int size,
-           line;
-  char *file;
-  struct mem_block *prev,
-      * next;
-
-  int padded; // This mem_block has padding for integrity checks.
-  // As of right now, this should only be 0 if
-  // using vpx_mem_alloc to allocate cache memory.
-  // 2005-01-11 tjf
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-  /*
-      vpx_memory_tracker_init(int padding_size, int pad_value)
-        padding_size - the size of the padding before and after each mem addr.
-                       Values > 0 indicate that integrity checks can be performed
-                       by inspecting these areas.
-        pad_value - the initial value within the padding area before and after
-                    each mem addr.
-
-      Initializes the memory tracker interface. Should be called before any
-      other calls to the memory tracker.
-  */
-  int vpx_memory_tracker_init(int padding_size, int pad_value);
-
-  /*
-      vpx_memory_tracker_destroy()
-      Deinitializes the memory tracker interface
-  */
-  void vpx_memory_tracker_destroy();
-
-  /*
-      vpx_memory_tracker_add(size_t addr, unsigned int size,
-                           char * file, unsigned int line)
-        addr - memory address to be added to list
-        size - size of addr
-        file - the file addr was referenced from
-        line - the line in file addr was referenced from
-      Adds memory address addr, it's size, file and line it came from
-      to the memory tracker allocation table
-  */
-  void vpx_memory_tracker_add(size_t addr, unsigned int size,
-                              char *file, unsigned int line,
-                              int padded);
-
-  /*
-      vpx_memory_tracker_add(size_t addr, unsigned int size, char * file, unsigned int line)
-        addr - memory address to be added to be removed
-        padded - if 0, disables bounds checking on this memory block even if bounds
-        checking is enabled. (for example, when allocating cache memory, we still want
-        to check for memory leaks, but we do not waste cache space for bounds check padding)
-      Removes the specified address from the memory tracker's allocation
-      table
-      Return:
-        0: on success
-        -1: if memory allocation table's mutex could not be locked
-        -2: if the addr was not found in the list
-  */
-  int vpx_memory_tracker_remove(size_t addr);
-
-  /*
-      vpx_memory_tracker_find(unsigned int addr)
-        addr - address to be found in the memory tracker's
-               allocation table
-      Return:
-          If found, pointer to the memory block that matches addr
-          NULL otherwise
-  */
-  struct mem_block *vpx_memory_tracker_find(size_t addr);
-
-  /*
-      vpx_memory_tracker_dump()
-      Dumps the current contents of the memory
-      tracker allocation table
-  */
-  void vpx_memory_tracker_dump();
-
-  /*
-      vpx_memory_tracker_check_integrity()
-      If a padding_size was provided to vpx_memory_tracker_init()
-      This function will verify that the region before and after each
-      memory address contains the specified pad_value. Should the check
-      fail, the filename and line of the check will be printed out.
-  */
-  void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
-
-  /*
-      vpx_memory_tracker_set_log_type
-        type - value representing the logging type to use
-        option - type specific option. This will be interpreted differently
-                 based on the type.
-      Sets the logging type for the memory tracker.
-      Values currently supported:
-        0: if option is NULL, log to stderr, otherwise interpret option as a
-           filename and attempt to open it.
-        1: Use output_debug_string (WIN32 only), option ignored
-      Return:
-        0: on success
-        -1: if the logging type could not be set, because the value was invalid
-            or because a file could not be opened
-  */
-  int vpx_memory_tracker_set_log_type(int type, char *option);
-
-  /*
-      vpx_memory_tracker_set_log_func
-        userdata - ptr to be passed to the supplied logfunc, can be NULL
-        logfunc - the logging function to be used to output data from
-                  vpx_memory_track_dump/check_integrity
-      Sets a logging function to be used by the memory tracker.
-      Return:
-        0: on success
-        -1: if the logging type could not be set because logfunc was NULL
-  */
-  int vpx_memory_tracker_set_log_func(void *userdata,
-                                      void(*logfunc)(void *userdata,
-                                                     const char *fmt, va_list args));
-
-  /* Wrappers to standard library functions. */
-  typedef void *(* mem_track_malloc_func)(size_t);
-  typedef void *(* mem_track_calloc_func)(size_t, size_t);
-  typedef void *(* mem_track_realloc_func)(void *, size_t);
-  typedef void (* mem_track_free_func)(void *);
-  typedef void *(* mem_track_memcpy_func)(void *, const void *, size_t);
-  typedef void *(* mem_track_memset_func)(void *, int, size_t);
-  typedef void *(* mem_track_memmove_func)(void *, const void *, size_t);
-
-  /*
-      vpx_memory_tracker_set_functions
-
-      Sets the function pointers for the standard library functions.
-
-      Return:
-        0: on success
-        -1: if the use global function pointers is not set.
-  */
-  int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif  // VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
diff --git a/libvpx/vpx_mem/memory_manager/hmm_alloc.c b/libvpx/vpx_mem/memory_manager/hmm_alloc.c
deleted file mode 100644
index ab3562d..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_alloc.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void *U(alloc)(U(descriptor) *desc, U(size_aau) n) {
-#ifdef HMM_AUDIT_FAIL
-
-  if (desc->avl_tree_root)
-    AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-    if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
-      AUDIT_BLOCK(desc->last_freed)
-#endif
-
-      U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
-      desc->last_freed = 0;
-    }
-
-  /* Add space for block header. */
-  n += HEAD_AAUS;
-
-  /* Convert n from number of address alignment units to block alignment
-  ** units. */
-  n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
-  if (n < MIN_BLOCK_BAUS)
-    n = MIN_BLOCK_BAUS;
-
-  {
-    /* Search for the first node of the bin containing the smallest
-    ** block big enough to satisfy request. */
-    ptr_record *ptr_rec_ptr =
-      U(avl_search)(
-        (U(avl_avl) *) & (desc->avl_tree_root), (U(size_bau)) n,
-        AVL_GREATER_EQUAL);
-
-    /* If an approprate bin is found, satisfy the allocation request,
-    ** otherwise return null pointer. */
-    return(ptr_rec_ptr ?
-           U(alloc_from_bin)(desc, ptr_rec_ptr, (U(size_bau)) n) : 0);
-  }
-}
diff --git a/libvpx/vpx_mem/memory_manager/hmm_base.c b/libvpx/vpx_mem/memory_manager/hmm_base.c
deleted file mode 100644
index 0eff59d..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_base.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(init)(U(descriptor) *desc) {
-  desc->avl_tree_root = 0;
-  desc->last_freed = 0;
-}
-
-/* Remove a free block from a bin's doubly-linked list when it is not,
-** the first block in the bin.
-*/
-void U(dll_remove)(
-  /* Pointer to pointer record in the block to be removed. */
-  ptr_record *to_remove) {
-  to_remove->prev->next = to_remove->next;
-
-  if (to_remove->next)
-    to_remove->next->prev = to_remove->prev;
-}
-
-/* Put a block into the free collection of a heap.
-*/
-void U(into_free_collection)(
-  /* Pointer to heap descriptor. */
-  U(descriptor) *desc,
-  /* Pointer to head record of block. */
-  head_record *head_ptr) {
-  ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
-  ptr_record *bin_front_ptr =
-    U(avl_insert)((U(avl_avl) *) & (desc->avl_tree_root), ptr_rec_ptr);
-
-  if (bin_front_ptr != ptr_rec_ptr) {
-    /* The block was not inserted into the AVL tree because there is
-    ** already a bin for the size of the block. */
-
-    MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(head_ptr)
-    ptr_rec_ptr->self = ptr_rec_ptr;
-
-    /* Make the block the new second block in the bin's doubly-linked
-    ** list. */
-    ptr_rec_ptr->prev = bin_front_ptr;
-    ptr_rec_ptr->next = bin_front_ptr->next;
-    bin_front_ptr->next = ptr_rec_ptr;
-
-    if (ptr_rec_ptr->next)
-      ptr_rec_ptr->next->prev = ptr_rec_ptr;
-  } else
-    /* Block is first block in new bin. */
-    ptr_rec_ptr->next = 0;
-}
-
-/* Allocate a block from a given bin.  Returns a pointer to the payload
-** of the removed block.  The "last freed" pointer must be null prior
-** to calling this function.
-*/
-void *U(alloc_from_bin)(
-  /* Pointer to heap descriptor. */
-  U(descriptor) *desc,
-  /* Pointer to pointer record of first block in bin. */
-  ptr_record *bin_front_ptr,
-  /* Number of BAUs needed in the allocated block.  If the block taken
-  ** from the bin is significantly larger than the number of BAUs needed,
-  ** the "extra" BAUs are split off to form a new free block. */
-  U(size_bau) n_baus) {
-  head_record *head_ptr;
-  U(size_bau) rem_baus;
-
-  if (bin_front_ptr->next) {
-    /* There are multiple blocks in this bin.  Use the 2nd block in
-    ** the bin to avoid needless change to the AVL tree.
-    */
-
-    ptr_record *ptr_rec_ptr = bin_front_ptr->next;
-    head_ptr = PTR_REC_TO_HEAD(ptr_rec_ptr);
-
-#ifdef AUDIT_FAIL
-    AUDIT_BLOCK(head_ptr)
-#endif
-
-    U(dll_remove)(ptr_rec_ptr);
-  } else {
-    /* There is only one block in the bin, so it has to be removed
-    ** from the AVL tree.
-    */
-
-    head_ptr = PTR_REC_TO_HEAD(bin_front_ptr);
-
-    U(avl_remove)(
-      (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
-  }
-
-  MARK_BLOCK_ALLOCATED(head_ptr)
-
-  rem_baus = BLOCK_BAUS(head_ptr) - n_baus;
-
-  if (rem_baus >= MIN_BLOCK_BAUS) {
-    /* Since there are enough "extra" BAUs, split them off to form
-    ** a new free block.
-    */
-
-    head_record *rem_head_ptr =
-      (head_record *) BAUS_FORWARD(head_ptr, n_baus);
-
-    /* Change the next block's header to reflect the fact that the
-    ** block preceeding it is now smaller.
-    */
-    SET_PREV_BLOCK_BAUS(
-      BAUS_FORWARD(head_ptr, head_ptr->block_size), rem_baus)
-
-    head_ptr->block_size = n_baus;
-
-    rem_head_ptr->previous_block_size = n_baus;
-    rem_head_ptr->block_size = rem_baus;
-
-    desc->last_freed = rem_head_ptr;
-  }
-
-  return(HEAD_TO_PTR_REC(head_ptr));
-}
-
-/* Take a block out of the free collection.
-*/
-void U(out_of_free_collection)(
-  /* Descriptor of heap that block is in. */
-  U(descriptor) *desc,
-  /* Pointer to head of block to take out of free collection. */
-  head_record *head_ptr) {
-  ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
-  if (ptr_rec_ptr->self == ptr_rec_ptr)
-    /* Block is not the front block in its bin, so all we have to
-    ** do is take it out of the bin's doubly-linked list. */
-    U(dll_remove)(ptr_rec_ptr);
-  else {
-    ptr_record *next = ptr_rec_ptr->next;
-
-    if (next)
-      /* Block is the front block in its bin, and there is at least
-      ** one other block in the bin.  Substitute the next block for
-      ** the front block. */
-      U(avl_subst)((U(avl_avl) *) & (desc->avl_tree_root), next);
-    else
-      /* Block is the front block in its bin, but there is no other
-      ** block in the bin.  Eliminate the bin. */
-      U(avl_remove)(
-        (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
-  }
-}
-
-void U(free)(U(descriptor) *desc, void *payload_ptr) {
-  /* Flags if coalesce with adjacent block. */
-  int coalesce;
-
-  head_record *fwd_head_ptr;
-  head_record *free_head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
-  desc->num_baus_can_shrink = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
-  AUDIT_BLOCK(free_head_ptr)
-
-  /* Make sure not freeing an already free block. */
-  if (!IS_BLOCK_ALLOCATED(free_head_ptr))
-    HMM_AUDIT_FAIL
-
-    if (desc->avl_tree_root)
-      /* Audit root block in AVL tree. */
-      AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
-      fwd_head_ptr =
-        (head_record *) BAUS_FORWARD(free_head_ptr, free_head_ptr->block_size);
-
-  if (free_head_ptr->previous_block_size) {
-    /* Coalesce with backward block if possible. */
-
-    head_record *bkwd_head_ptr =
-      (head_record *) BAUS_BACKWARD(
-        free_head_ptr, free_head_ptr->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
-    AUDIT_BLOCK(bkwd_head_ptr)
-#endif
-
-    if (bkwd_head_ptr == (head_record *)(desc->last_freed)) {
-      desc->last_freed = 0;
-      coalesce = 1;
-    } else if (IS_BLOCK_ALLOCATED(bkwd_head_ptr))
-      coalesce = 0;
-    else {
-      U(out_of_free_collection)(desc, bkwd_head_ptr);
-      coalesce = 1;
-    }
-
-    if (coalesce) {
-      bkwd_head_ptr->block_size += free_head_ptr->block_size;
-      SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(bkwd_head_ptr))
-      free_head_ptr = bkwd_head_ptr;
-    }
-  }
-
-  if (fwd_head_ptr->block_size == 0) {
-    /* Block to be freed is last block before dummy end-of-chunk block. */
-    desc->end_of_shrinkable_chunk =
-      BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
-    desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
-    if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
-      /* Free block is the entire chunk, so shrinking can eliminate
-      ** entire chunk including dummy end block. */
-      desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
-  } else {
-    /* Coalesce with forward block if possible. */
-
-#ifdef HMM_AUDIT_FAIL
-    AUDIT_BLOCK(fwd_head_ptr)
-#endif
-
-    if (fwd_head_ptr == (head_record *)(desc->last_freed)) {
-      desc->last_freed = 0;
-      coalesce = 1;
-    } else if (IS_BLOCK_ALLOCATED(fwd_head_ptr))
-      coalesce = 0;
-    else {
-      U(out_of_free_collection)(desc, fwd_head_ptr);
-      coalesce = 1;
-    }
-
-    if (coalesce) {
-      free_head_ptr->block_size += fwd_head_ptr->block_size;
-
-      fwd_head_ptr =
-        (head_record *) BAUS_FORWARD(
-          fwd_head_ptr, BLOCK_BAUS(fwd_head_ptr));
-
-      SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(free_head_ptr))
-
-      if (fwd_head_ptr->block_size == 0) {
-        /* Coalesced block to be freed is last block before dummy
-        ** end-of-chunk block. */
-        desc->end_of_shrinkable_chunk =
-          BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
-        desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
-        if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
-          /* Free block is the entire chunk, so shrinking can
-          ** eliminate entire chunk including dummy end block. */
-          desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
-      }
-    }
-  }
-
-  if (desc->last_freed) {
-    /* There is a last freed block, but it is not adjacent to the
-    ** block being freed by this call to free, so put the last
-    ** freed block into the free collection.
-    */
-
-#ifdef HMM_AUDIT_FAIL
-    AUDIT_BLOCK(desc->last_freed)
-#endif
-
-    U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-  }
-
-  desc->last_freed = free_head_ptr;
-}
-
-void U(new_chunk)(U(descriptor) *desc, void *start, U(size_bau) n_baus) {
-#ifdef HMM_AUDIT_FAIL
-
-  if (desc->avl_tree_root)
-    /* Audit root block in AVL tree. */
-    AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) start)
-
-    /* Make the chunk one big free block followed by a dummy end block.
-    */
-
-    n_baus -= DUMMY_END_BLOCK_BAUS;
-
-  HEAD_PTR->previous_block_size = 0;
-  HEAD_PTR->block_size = n_baus;
-
-  U(into_free_collection)(desc, HEAD_PTR);
-
-  /* Set up the dummy end block. */
-  start = BAUS_FORWARD(start, n_baus);
-  HEAD_PTR->previous_block_size = n_baus;
-  HEAD_PTR->block_size = 0;
-
-#undef HEAD_PTR
-}
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Function that does audit fail actions defined my preprocessor symbol,
-** and returns a dummy integer value.
-*/
-int U(audit_block_fail_dummy_return)(void) {
-  HMM_AUDIT_FAIL
-
-  /* Dummy return. */
-  return(0);
-}
-
-#endif
-
-/* AVL Tree instantiation. */
-
-#ifdef HMM_AUDIT_FAIL
-
-/* The AVL tree generic package passes an ACCESS of 1 when it "touches"
-** a child node for the first time during a particular operation.  I use
-** this feature to audit only one time (per operation) the free blocks
-** that are tree nodes.  Since the root node is not a child node, it has
-** to be audited directly.
-*/
-
-/* The pain you feel while reading these macros will not be in vain.  It
-** will remove all doubt from you mind that C++ inline functions are
-** a very good thing.
-*/
-
-#define AVL_GET_LESS(H, ACCESS) \
-  (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->self)
-#define AVL_GET_GREATER(H, ACCESS) \
-  (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->prev)
-
-#else
-
-#define AVL_GET_LESS(H, ACCESS) ((H)->self)
-#define AVL_GET_GREATER(H, ACCESS) ((H)->prev)
-
-#endif
-
-#define AVL_SET_LESS(H, LH) (H)->self = (LH);
-#define AVL_SET_GREATER(H, GH) (H)->prev = (GH);
-
-/*  high bit of high bit of
-**  block_size  previous_block_size balance factor
-**  ----------- ------------------- --------------
-**  0       0           n/a (block allocated)
-**  0       1           1
-**  1       0           -1
-**  1       1           0
-*/
-
-#define AVL_GET_BALANCE_FACTOR(H) \
-  ((((head_record *) (PTR_REC_TO_HEAD(H)))->block_size & \
-    HIGH_BIT_BAU_SIZE) ? \
-   (((head_record *) (PTR_REC_TO_HEAD(H)))->previous_block_size & \
-    HIGH_BIT_BAU_SIZE ? 0 : -1) : 1)
-
-#define AVL_SET_BALANCE_FACTOR(H, BF) \
-  {                         \
-    register head_record *p =               \
-                                            (head_record *) PTR_REC_TO_HEAD(H);       \
-    register int bal_f = (BF);              \
-    \
-    if (bal_f <= 0)                 \
-      p->block_size |= HIGH_BIT_BAU_SIZE;       \
-    else                        \
-      p->block_size &= ~HIGH_BIT_BAU_SIZE;      \
-    if (bal_f >= 0)                 \
-      p->previous_block_size |= HIGH_BIT_BAU_SIZE;  \
-    else                        \
-      p->previous_block_size &= ~HIGH_BIT_BAU_SIZE; \
-  }
-
-#define COMPARE_KEY_KEY(K1, K2) ((K1) == (K2) ? 0 : ((K1) > (K2) ? 1 : -1))
-
-#define AVL_COMPARE_KEY_NODE(K, H) \
-  COMPARE_KEY_KEY(K, BLOCK_BAUS(PTR_REC_TO_HEAD(H)))
-
-#define AVL_COMPARE_NODE_NODE(H1, H2) \
-  COMPARE_KEY_KEY(BLOCK_BAUS(PTR_REC_TO_HEAD(H1)), \
-                  BLOCK_BAUS(PTR_REC_TO_HEAD(H2)))
-
-#define AVL_NULL ((ptr_record *) 0)
-
-#define AVL_IMPL_MASK \
-  ( AVL_IMPL_INSERT | AVL_IMPL_SEARCH | AVL_IMPL_REMOVE | AVL_IMPL_SUBST )
-
-#include "cavl_impl.h"
diff --git a/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c b/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
deleted file mode 100644
index 51c3cc2..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-/* The function in this file performs default actions if self-auditing
-** finds heap corruption.  Don't rely on this code to handle the
-** case where HMM is being used to implement the malloc and free standard
-** library functions.  Rewrite the function if necessary to avoid using
-** I/O and execution termination functions that call malloc or free.
-** In Unix, for example, you would replace the fputs calls with calls
-** to the write system call using file handle number 2.
-*/
-#include "hmm_intrnl.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static int entered = 0;
-
-/* Print abort message, file and line.  Terminate execution.
-*/
-void hmm_dflt_abort(const char *file, const char *line) {
-  /* Avoid use of printf(), which is more likely to use heap. */
-
-  if (entered)
-
-    /* The standard I/O functions called a heap function and caused
-    ** an indirect recursive call to this function.  So we'll have
-    ** to just exit without printing a message.  */
-    while (1);
-
-  entered = 1;
-
-  fputs("\n_abort - Heap corruption\n" "File: ", stderr);
-  fputs(file, stderr);
-  fputs("  Line: ", stderr);
-  fputs(line, stderr);
-  fputs("\n\n", stderr);
-  fputs("hmm_dflt_abort: while(1)!!!\n", stderr);
-  fflush(stderr);
-
-  while (1);
-}
diff --git a/libvpx/vpx_mem/memory_manager/hmm_grow.c b/libvpx/vpx_mem/memory_manager/hmm_grow.c
deleted file mode 100644
index 0e86373..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_grow.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(grow_chunk)(U(descriptor) *desc, void *end, U(size_bau) n_baus) {
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) end)
-
-  end = BAUS_BACKWARD(end, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
-  if (HEAD_PTR->block_size != 0)
-    /* Chunk does not have valid dummy end block. */
-    HMM_AUDIT_FAIL
-
-#endif
-
-    /* Create a new block that absorbs the old dummy end block. */
-    HEAD_PTR->block_size = n_baus;
-
-  /* Set up the new dummy end block. */
-  {
-    head_record *dummy = (head_record *) BAUS_FORWARD(end, n_baus);
-    dummy->previous_block_size = n_baus;
-    dummy->block_size = 0;
-  }
-
-  /* Simply free the new block, allowing it to coalesce with any
-  ** free block at that was the last block in the chunk prior to
-  ** growth.
-  */
-  U(free)(desc, HEAD_TO_PTR_REC(end));
-
-#undef HEAD_PTR
-}
diff --git a/libvpx/vpx_mem/memory_manager/hmm_largest.c b/libvpx/vpx_mem/memory_manager/hmm_largest.c
deleted file mode 100644
index 192758d..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_largest.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(largest_available)(U(descriptor) *desc) {
-  U(size_bau) largest;
-
-  if (!(desc->avl_tree_root))
-    largest = 0;
-  else {
-#ifdef HMM_AUDIT_FAIL
-    /* Audit root block in AVL tree. */
-    AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-    largest =
-      BLOCK_BAUS(
-        PTR_REC_TO_HEAD(
-          U(avl_search)(
-            (U(avl_avl) *) & (desc->avl_tree_root),
-            (U(size_bau)) ~(U(size_bau)) 0, AVL_LESS)));
-  }
-
-  if (desc->last_freed) {
-    /* Size of last freed block. */
-    register U(size_bau) lf_size;
-
-#ifdef HMM_AUDIT_FAIL
-    AUDIT_BLOCK(desc->last_freed)
-#endif
-
-    lf_size = BLOCK_BAUS(desc->last_freed);
-
-    if (lf_size > largest)
-      largest = lf_size;
-  }
-
-  /* Convert largest size to AAUs and subract head size leaving payload
-  ** size.
-  */
-  return(largest ?
-         ((largest * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) - HEAD_AAUS) :
-         0);
-}
diff --git a/libvpx/vpx_mem/memory_manager/hmm_resize.c b/libvpx/vpx_mem/memory_manager/hmm_resize.c
deleted file mode 100644
index baa5a8f..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_resize.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-int U(resize)(U(descriptor) *desc, void *mem, U(size_aau) n) {
-  U(size_aau) i;
-  head_record *next_head_ptr;
-  head_record *head_ptr = PTR_REC_TO_HEAD(mem);
-
-  /* Flag. */
-  int next_block_free;
-
-  /* Convert n from desired block size in AAUs to BAUs. */
-  n += HEAD_AAUS;
-  n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
-  if (n < MIN_BLOCK_BAUS)
-    n = MIN_BLOCK_BAUS;
-
-#ifdef HMM_AUDIT_FAIL
-
-  AUDIT_BLOCK(head_ptr)
-
-  if (!IS_BLOCK_ALLOCATED(head_ptr))
-    HMM_AUDIT_FAIL
-
-    if (desc->avl_tree_root)
-      AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
-      i = head_ptr->block_size;
-
-  next_head_ptr =
-    (head_record *) BAUS_FORWARD(head_ptr, head_ptr->block_size);
-
-  next_block_free =
-    (next_head_ptr == desc->last_freed) ||
-    !IS_BLOCK_ALLOCATED(next_head_ptr);
-
-  if (next_block_free)
-    /* Block can expand into next free block. */
-    i += BLOCK_BAUS(next_head_ptr);
-
-  if (n > i)
-    /* Not enough room for block to expand. */
-    return(-1);
-
-  if (next_block_free) {
-#ifdef HMM_AUDIT_FAIL
-    AUDIT_BLOCK(next_head_ptr)
-#endif
-
-    if (next_head_ptr == desc->last_freed)
-      desc->last_freed = 0;
-    else
-      U(out_of_free_collection)(desc, next_head_ptr);
-
-    next_head_ptr =
-      (head_record *) BAUS_FORWARD(head_ptr, (U(size_bau)) i);
-  }
-
-  /* Set i to number of "extra" BAUs. */
-  i -= n;
-
-  if (i < MIN_BLOCK_BAUS)
-    /* Not enough extra BAUs to be a block on their own, so just keep them
-    ** in the block being resized.
-    */
-  {
-    n += i;
-    i = n;
-  } else {
-    /* There are enough "leftover" BAUs in the next block to
-    ** form a remainder block. */
-
-    head_record *rem_head_ptr;
-
-    rem_head_ptr = (head_record *) BAUS_FORWARD(head_ptr, n);
-
-    rem_head_ptr->previous_block_size = (U(size_bau)) n;
-    rem_head_ptr->block_size = (U(size_bau)) i;
-
-    if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
-      AUDIT_BLOCK(desc->last_freed)
-#endif
-
-      U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
-      desc->last_freed = 0;
-    }
-
-    desc->last_freed = rem_head_ptr;
-  }
-
-  head_ptr->block_size = (U(size_bau)) n;
-  next_head_ptr->previous_block_size = (U(size_bau)) i;
-
-  return(0);
-}
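
The first few lines of resize() above turn a requested payload size in AAUs into a whole block size in BAUs: add the head, round up, then clamp to the minimum block size. Here is a hedged standalone sketch of that conversion; the constants are placeholders, not the configured ones.

  #include <stdio.h>

  /* Placeholder constants; the real ones come from hmm_cnfg.h / hmm_intrnl.h. */
  #define AAUS_PER_BAU   1
  #define HEAD_AAUS      1
  #define MIN_BLOCK_BAUS 2

  #define DIV_ROUND_UP(NUMER, DENOM) (((NUMER) + (DENOM) - 1) / (DENOM))

  /* Requested payload in AAUs -> total block size in BAUs. */
  static unsigned long request_to_baus(unsigned long payload_aaus) {
    unsigned long n = DIV_ROUND_UP(payload_aaus + HEAD_AAUS, AAUS_PER_BAU);
    return (n < MIN_BLOCK_BAUS) ? MIN_BLOCK_BAUS : n;
  }

  int main(void) {
    printf("payload of 5 AAUs needs a %lu-BAU block\n", request_to_baus(5));
    printf("payload of 0 AAUs still needs %lu BAUs\n", request_to_baus(0));
    return 0;
  }
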
diff --git a/libvpx/vpx_mem/memory_manager/hmm_shrink.c b/libvpx/vpx_mem/memory_manager/hmm_shrink.c
deleted file mode 100644
index f80aeea..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_shrink.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(shrink_chunk)(U(descriptor) *desc, U(size_bau) n_baus_to_shrink) {
-  head_record *dummy_end_block = (head_record *)
-                                 BAUS_BACKWARD(desc->end_of_shrinkable_chunk, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
-  if (dummy_end_block->block_size != 0)
-    /* Chunk does not have valid dummy end block. */
-    HMM_AUDIT_FAIL
-
-#endif
-
-    if (n_baus_to_shrink) {
-      head_record *last_block = (head_record *)
-                                BAUS_BACKWARD(
-                                  dummy_end_block, dummy_end_block->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
-      AUDIT_BLOCK(last_block)
-#endif
-
-      if (last_block == desc->last_freed) {
-        U(size_bau) bs = BLOCK_BAUS(last_block);
-
-        /* Chunk will not be shrunk out of existence if
-        ** 1.  There is at least one allocated block in the chunk
-        **     and the amount to shrink is exactly the size of the
-        **     last block, OR
-        ** 2.  After the last block is shrunk, there will be enough
-        **     BAUs left in it to form a minimal size block. */
-        int chunk_will_survive =
-          (PREV_BLOCK_BAUS(last_block) && (n_baus_to_shrink == bs)) ||
-          (n_baus_to_shrink <= (U(size_bau))(bs - MIN_BLOCK_BAUS));
-
-        if (chunk_will_survive ||
-            (!PREV_BLOCK_BAUS(last_block) &&
-             (n_baus_to_shrink ==
-              (U(size_bau))(bs + DUMMY_END_BLOCK_BAUS)))) {
-          desc->last_freed = 0;
-
-          if (chunk_will_survive) {
-            bs -= n_baus_to_shrink;
-
-            if (bs) {
-              /* The last (non-dummy) block was not completely
-              ** eliminated by the shrink. */
-
-              last_block->block_size = bs;
-
-              /* Create new dummy end record.
-              */
-              dummy_end_block =
-                (head_record *) BAUS_FORWARD(last_block, bs);
-              dummy_end_block->previous_block_size = bs;
-              dummy_end_block->block_size = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
-              if (desc->avl_tree_root)
-                AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-                U(into_free_collection)(desc, last_block);
-            } else {
-              /* The last (non-dummy) block was completely
-              ** eliminated by the shrink.  Make its head
-              ** the new dummy end block.
-              */
-              last_block->block_size = 0;
-              last_block->previous_block_size &= ~HIGH_BIT_BAU_SIZE;
-            }
-          }
-        }
-
-#ifdef HMM_AUDIT_FAIL
-        else
-          HMM_AUDIT_FAIL
-#endif
-        }
-
-#ifdef HMM_AUDIT_FAIL
-      else
-        HMM_AUDIT_FAIL
-#endif
-      }
-}
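
The survival test in shrink_chunk() reduces to arithmetic on the last free block: the shrink must either consume that block exactly (when other blocks remain) or leave enough BAUs behind for a minimal block. A small sketch of the same check in isolation, with a placeholder minimum and the subtraction rearranged to avoid unsigned wrap-around; it assumes the last block is at least the minimum size, as the real code does.

  #include <stdio.h>

  #define MIN_BLOCK_BAUS 2   /* placeholder, not the configured value */

  /* prev_baus:   size of the block before the last one (0 if none)
  ** last_baus:   size of the last (free) block in the chunk
  ** shrink_baus: amount the caller wants to shrink by              */
  static int chunk_survives(unsigned long prev_baus, unsigned long last_baus,
                            unsigned long shrink_baus) {
    return (prev_baus != 0 && shrink_baus == last_baus) ||
           (shrink_baus + MIN_BLOCK_BAUS <= last_baus);
  }

  int main(void) {
    printf("%d\n", chunk_survives(4, 8, 8));  /* 1: shrink is exactly the last block */
    printf("%d\n", chunk_survives(0, 8, 7));  /* 0: would leave only 1 BAU behind */
    return 0;
  }
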
diff --git a/libvpx/vpx_mem/memory_manager/hmm_true.c b/libvpx/vpx_mem/memory_manager/hmm_true.c
deleted file mode 100644
index 4428c3e..0000000
--- a/libvpx/vpx_mem/memory_manager/hmm_true.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(true_size)(void *payload_ptr) {
-  register  head_record *head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
-#ifdef HMM_AUDIT_FAIL
-  AUDIT_BLOCK(head_ptr)
-#endif
-
-  /* Convert block size from BAUs to AAUs.  Subtract head size, leaving
-  ** payload size.
-  */
-  return(
-          (BLOCK_BAUS(head_ptr) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) -
-          HEAD_AAUS);
-}
diff --git a/libvpx/vpx_mem/memory_manager/include/cavl_if.h b/libvpx/vpx_mem/memory_manager/include/cavl_if.h
deleted file mode 100644
index a5ced8b..0000000
--- a/libvpx/vpx_mem/memory_manager/include/cavl_if.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Interface generation header file.
-**
-** This code is in the public domain.  See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5  Author: Walt Karas
-*/
-
-/* This header contains the definition of CHAR_BIT (number of bits in a
-** char). */
-#include <limits.h>
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#ifndef AVL_SEARCH_TYPE_DEFINED_
-#define AVL_SEARCH_TYPE_DEFINED_
-
-typedef enum {
-  AVL_EQUAL = 1,
-  AVL_LESS = 2,
-  AVL_GREATER = 4,
-  AVL_LESS_EQUAL = AVL_EQUAL | AVL_LESS,
-  AVL_GREATER_EQUAL = AVL_EQUAL | AVL_GREATER
-}
-avl_search_type;
-
-#endif
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine storage class for function prototypes. */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC extern
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-typedef struct {
-#ifdef AVL_INSIDE_STRUCT
-
-  AVL_INSIDE_STRUCT
-
-#endif
-
-  AVL_HANDLE root;
-}
-L_(avl);
-
-/* Function prototypes. */
-
-L_SC void L_(init)(L_(avl) *tree);
-
-L_SC int L_(is_empty)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *tree, AVL_HANDLE h);
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *tree, AVL_KEY k, avl_search_type st);
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *tree, AVL_KEY k);
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *tree, AVL_HANDLE new_node);
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-L_SC int L_(build)(
-  L_(avl) *tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes);
-
-#endif
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits.  Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-/* Number of bits in a long. */
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-/* The macro L_BIT_ARR_DEFN defines a bit array whose index is a (0-based)
-** node depth.  The definition depends on whether the maximum depth is more
-** or less than the number of bits in a single long.
-*/
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* Maximum depth may be more than number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) \
-  unsigned long NAME[((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT];
-
-#else
-
-/* Maximum depth is definitely less than number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#endif
-
-/* Iterator structure. */
-typedef struct {
-  /* Tree being iterated over. */
-  L_(avl) *tree_;
-
-  /* Records a path into the tree.  If bit n is true, indicates
-  ** take greater branch from the nth node in the path, otherwise
-  ** take the less branch.  bit 0 gives branch from root, and
-  ** so on. */
-  L_BIT_ARR_DEFN(branch)
-
-  /* Zero-based depth of path into tree. */
-  unsigned depth;
-
-  /* Handles of nodes in path from root to current node (returned by *). */
-  AVL_HANDLE path_h[(AVL_MAX_DEPTH) - 1];
-}
-L_(iter);
-
-/* Iterator function prototypes. */
-
-L_SC void L_(start_iter)(
-  L_(avl) *tree, L_(iter) *iter, AVL_KEY k, avl_search_type st);
-
-L_SC void L_(start_iter_least)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC void L_(start_iter_greatest)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter);
-
-L_SC void L_(incr_iter)(L_(iter) *iter);
-
-L_SC void L_(decr_iter)(L_(iter) *iter);
-
-L_SC void L_(init_iter)(L_(iter) *iter);
-
-#define AVL_IMPL_INIT           1
-#define AVL_IMPL_IS_EMPTY       (1 << 1)
-#define AVL_IMPL_INSERT         (1 << 2)
-#define AVL_IMPL_SEARCH         (1 << 3)
-#define AVL_IMPL_SEARCH_LEAST       (1 << 4)
-#define AVL_IMPL_SEARCH_GREATEST    (1 << 5)
-#define AVL_IMPL_REMOVE         (1 << 6)
-#define AVL_IMPL_BUILD          (1 << 7)
-#define AVL_IMPL_START_ITER     (1 << 8)
-#define AVL_IMPL_START_ITER_LEAST   (1 << 9)
-#define AVL_IMPL_START_ITER_GREATEST    (1 << 10)
-#define AVL_IMPL_GET_ITER       (1 << 11)
-#define AVL_IMPL_INCR_ITER      (1 << 12)
-#define AVL_IMPL_DECR_ITER      (1 << 13)
-#define AVL_IMPL_INIT_ITER      (1 << 14)
-#define AVL_IMPL_SUBST          (1 << 15)
-
-#define AVL_IMPL_ALL            (~0)
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#endif  // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
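
The iterator declared above records its root-to-node path as a bit array indexed by depth: bit n set means the greater branch was taken at the nth node. The following self-contained sketch shows the single-long flavour of those macros; the header's multi-long variant generalizes this when AVL_MAX_DEPTH exceeds the number of bits in a long.

  #include <stdio.h>

  /* Single-long flavour of the path bit-array macros. */
  #define BIT_ARR_VAL(ARR, BIT) ((ARR) & (1L << (BIT)))
  #define BIT_ARR_0(ARR, BIT)   ((ARR) &= ~(1L << (BIT)))
  #define BIT_ARR_1(ARR, BIT)   ((ARR) |= 1L << (BIT))

  int main(void) {
    unsigned long branch = 0;   /* the recorded path, one bit per tree level */

    BIT_ARR_1(branch, 0);       /* depth 0: took the greater branch from the root */
    BIT_ARR_0(branch, 1);       /* depth 1: took the less branch */

    printf("depth 0: %s\n", BIT_ARR_VAL(branch, 0) ? "greater" : "less");
    printf("depth 1: %s\n", BIT_ARR_VAL(branch, 1) ? "greater" : "less");
    return 0;
  }
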
diff --git a/libvpx/vpx_mem/memory_manager/include/cavl_impl.h b/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
deleted file mode 100644
index 8b9ae27..0000000
--- a/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
+++ /dev/null
@@ -1,1152 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Implementation generation header file.
-**
-** This code is in the public domain.  See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5  Author: Walt Karas
-*/
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef l_tree
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_PREFIX
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine correct storage class for functions */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-#define L_MASK_HIGH_BIT ((int) ~ ((~ (unsigned) 0) >> 1))
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits.  Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* The maximum depth may be greater than the number of bits in a long,
-** so multiple longs are needed to hold a bit array indexed by node
-** depth. */
-
-#define L_BIT_ARR_LONGS (((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT)
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME[L_BIT_ARR_LONGS];
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) \
-  ((BIT_ARR)[(BIT_NUM) / L_LONG_BIT] & (1L << ((BIT_NUM) % L_LONG_BIT)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) \
-  (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] &= ~(1L << ((BIT_NUM) % L_LONG_BIT));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) \
-  (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] |= 1L << ((BIT_NUM) % L_LONG_BIT);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) \
-  { int i = L_BIT_ARR_LONGS; do (BIT_ARR)[--i] = 0L - (BIT_VAL); while(i); }
-
-#else /* The bit array can definitely fit in one long */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) ((BIT_ARR) & (1L << (BIT_NUM)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) (BIT_ARR) &= ~(1L << (BIT_NUM));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) (BIT_ARR) |= 1L << (BIT_NUM);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) (BIT_ARR) = 0L - (BIT_VAL);
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN) \
-  { if (AVL_READ_ERROR) return(ERROR_RETURN); }
-
-#else
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN)
-
-#endif
-
-/* The presumed reason that an instantiation places additional fields
-** inside the AVL tree structure is that the SET_ and GET_ macros
-** need these fields.  The "balance" function does not explicitly use
-** any fields in the AVL tree structure, so only pass an AVL tree
-** structure pointer to "balance" if it has instantiation-specific
-** fields that are (presumably) needed by the SET_/GET_ calls within
-** "balance".
-*/
-#ifdef AVL_INSIDE_STRUCT
-
-#define L_BALANCE_PARAM_CALL_PREFIX l_tree,
-#define L_BALANCE_PARAM_DECL_PREFIX L_(avl) *l_tree,
-
-#else
-
-#define L_BALANCE_PARAM_CALL_PREFIX
-#define L_BALANCE_PARAM_DECL_PREFIX
-
-#endif
-
-#ifdef AVL_IMPL_MASK
-
-#define L_IMPL_MASK (AVL_IMPL_MASK)
-
-#else
-
-/* Define all functions. */
-#define L_IMPL_MASK AVL_IMPL_ALL
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT)
-
-L_SC void L_(init)(L_(avl) *l_tree) {
-  l_tree->root = AVL_NULL;
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_IS_EMPTY)
-
-L_SC int L_(is_empty)(L_(avl) *l_tree) {
-  return(l_tree->root == AVL_NULL);
-}
-
-#endif
-
-/* Put the private balance function in the same compilation module as
-** the insert function.  */
-#if (L_IMPL_MASK & AVL_IMPL_INSERT)
-
-/* Balances subtree, returns handle of root node of subtree after balancing.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h) {
-  AVL_HANDLE deep_h;
-
-  /* Either the "greater than" or the "less than" subtree of
-  ** this node has to be 2 levels deeper (or else it wouldn't
-  ** need balancing).
-  */
-  if (AVL_GET_BALANCE_FACTOR(bal_h) > 0) {
-    /* "Greater than" subtree is deeper. */
-
-    deep_h = AVL_GET_GREATER(bal_h, 1);
-
-    L_CHECK_READ_ERROR(AVL_NULL)
-
-    if (AVL_GET_BALANCE_FACTOR(deep_h) < 0) {
-      int bf;
-
-      AVL_HANDLE old_h = bal_h;
-      bal_h = AVL_GET_LESS(deep_h, 1);
-      L_CHECK_READ_ERROR(AVL_NULL)
-      AVL_SET_GREATER(old_h, AVL_GET_LESS(bal_h, 1))
-      AVL_SET_LESS(deep_h, AVL_GET_GREATER(bal_h, 1))
-      AVL_SET_LESS(bal_h, old_h)
-      AVL_SET_GREATER(bal_h, deep_h)
-
-      bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
-      if (bf != 0) {
-        if (bf > 0) {
-          AVL_SET_BALANCE_FACTOR(old_h, -1)
-          AVL_SET_BALANCE_FACTOR(deep_h, 0)
-        } else {
-          AVL_SET_BALANCE_FACTOR(deep_h, 1)
-          AVL_SET_BALANCE_FACTOR(old_h, 0)
-        }
-
-        AVL_SET_BALANCE_FACTOR(bal_h, 0)
-      } else {
-        AVL_SET_BALANCE_FACTOR(old_h, 0)
-        AVL_SET_BALANCE_FACTOR(deep_h, 0)
-      }
-    } else {
-      AVL_SET_GREATER(bal_h, AVL_GET_LESS(deep_h, 0))
-      AVL_SET_LESS(deep_h, bal_h)
-
-      if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
-        AVL_SET_BALANCE_FACTOR(deep_h, -1)
-        AVL_SET_BALANCE_FACTOR(bal_h, 1)
-      } else {
-        AVL_SET_BALANCE_FACTOR(deep_h, 0)
-        AVL_SET_BALANCE_FACTOR(bal_h, 0)
-      }
-
-      bal_h = deep_h;
-    }
-  } else {
-    /* "Less than" subtree is deeper. */
-
-    deep_h = AVL_GET_LESS(bal_h, 1);
-    L_CHECK_READ_ERROR(AVL_NULL)
-
-    if (AVL_GET_BALANCE_FACTOR(deep_h) > 0) {
-      int bf;
-      AVL_HANDLE old_h = bal_h;
-      bal_h = AVL_GET_GREATER(deep_h, 1);
-      L_CHECK_READ_ERROR(AVL_NULL)
-      AVL_SET_LESS(old_h, AVL_GET_GREATER(bal_h, 0))
-      AVL_SET_GREATER(deep_h, AVL_GET_LESS(bal_h, 0))
-      AVL_SET_GREATER(bal_h, old_h)
-      AVL_SET_LESS(bal_h, deep_h)
-
-      bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
-      if (bf != 0) {
-        if (bf < 0) {
-          AVL_SET_BALANCE_FACTOR(old_h, 1)
-          AVL_SET_BALANCE_FACTOR(deep_h, 0)
-        } else {
-          AVL_SET_BALANCE_FACTOR(deep_h, -1)
-          AVL_SET_BALANCE_FACTOR(old_h, 0)
-        }
-
-        AVL_SET_BALANCE_FACTOR(bal_h, 0)
-      } else {
-        AVL_SET_BALANCE_FACTOR(old_h, 0)
-        AVL_SET_BALANCE_FACTOR(deep_h, 0)
-      }
-    } else {
-      AVL_SET_LESS(bal_h, AVL_GET_GREATER(deep_h, 0))
-      AVL_SET_GREATER(deep_h, bal_h)
-
-      if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
-        AVL_SET_BALANCE_FACTOR(deep_h, 1)
-        AVL_SET_BALANCE_FACTOR(bal_h, -1)
-      } else {
-        AVL_SET_BALANCE_FACTOR(deep_h, 0)
-        AVL_SET_BALANCE_FACTOR(bal_h, 0)
-      }
-
-      bal_h = deep_h;
-    }
-  }
-
-  return(bal_h);
-}
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *l_tree, AVL_HANDLE h) {
-  AVL_SET_LESS(h, AVL_NULL)
-  AVL_SET_GREATER(h, AVL_NULL)
-  AVL_SET_BALANCE_FACTOR(h, 0)
-
-  if (l_tree->root == AVL_NULL)
-    l_tree->root = h;
-  else {
-    /* Last unbalanced node encountered in search for insertion point. */
-    AVL_HANDLE unbal = AVL_NULL;
-    /* Parent of last unbalanced node. */
-    AVL_HANDLE parent_unbal = AVL_NULL;
-    /* Balance factor of last unbalanced node. */
-    int unbal_bf;
-
-    /* Zero-based depth in tree. */
-    unsigned depth = 0, unbal_depth = 0;
-
-    /* Records a path into the tree.  If bit n is true, indicates
-    ** take greater branch from the nth node in the path, otherwise
-    ** take the less branch.  bit 0 gives branch from root, and
-    ** so on. */
-    L_BIT_ARR_DEFN(branch)
-
-    AVL_HANDLE hh = l_tree->root;
-    AVL_HANDLE parent = AVL_NULL;
-    int cmp;
-
-    do {
-      if (AVL_GET_BALANCE_FACTOR(hh) != 0) {
-        unbal = hh;
-        parent_unbal = parent;
-        unbal_depth = depth;
-      }
-
-      cmp = AVL_COMPARE_NODE_NODE(h, hh);
-
-      if (cmp == 0)
-        /* Duplicate key. */
-        return(hh);
-
-      parent = hh;
-
-      if (cmp > 0) {
-        hh = AVL_GET_GREATER(hh, 1);
-        L_BIT_ARR_1(branch, depth)
-      } else {
-        hh = AVL_GET_LESS(hh, 1);
-        L_BIT_ARR_0(branch, depth)
-      }
-
-      L_CHECK_READ_ERROR(AVL_NULL)
-      depth++;
-    } while (hh != AVL_NULL);
-
-    /*  Add node to insert as leaf of tree. */
-    if (cmp < 0)
-      AVL_SET_LESS(parent, h)
-      else
-        AVL_SET_GREATER(parent, h)
-
-        depth = unbal_depth;
-
-    if (unbal == AVL_NULL)
-      hh = l_tree->root;
-    else {
-      cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-      depth++;
-      unbal_bf = AVL_GET_BALANCE_FACTOR(unbal);
-
-      if (cmp < 0)
-        unbal_bf--;
-      else  /* cmp > 0 */
-        unbal_bf++;
-
-      hh = cmp < 0 ? AVL_GET_LESS(unbal, 1) : AVL_GET_GREATER(unbal, 1);
-      L_CHECK_READ_ERROR(AVL_NULL)
-
-      if ((unbal_bf != -2) && (unbal_bf != 2)) {
-        /* No rebalancing of tree is necessary. */
-        AVL_SET_BALANCE_FACTOR(unbal, unbal_bf)
-        unbal = AVL_NULL;
-      }
-    }
-
-    if (hh != AVL_NULL)
-      while (h != hh) {
-        cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-        depth++;
-
-        if (cmp < 0) {
-          AVL_SET_BALANCE_FACTOR(hh, -1)
-          hh = AVL_GET_LESS(hh, 1);
-        } else { /* cmp > 0 */
-          AVL_SET_BALANCE_FACTOR(hh, 1)
-          hh = AVL_GET_GREATER(hh, 1);
-        }
-
-        L_CHECK_READ_ERROR(AVL_NULL)
-      }
-
-    if (unbal != AVL_NULL) {
-      unbal = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX unbal);
-      L_CHECK_READ_ERROR(AVL_NULL)
-
-      if (parent_unbal == AVL_NULL)
-        l_tree->root = unbal;
-      else {
-        depth = unbal_depth - 1;
-        cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
-        if (cmp < 0)
-          AVL_SET_LESS(parent_unbal, unbal)
-          else  /* cmp > 0 */
-            AVL_SET_GREATER(parent_unbal, unbal)
-          }
-    }
-
-  }
-
-  return(h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH)
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *l_tree, AVL_KEY k, avl_search_type st) {
-  int cmp, target_cmp;
-  AVL_HANDLE match_h = AVL_NULL;
-  AVL_HANDLE h = l_tree->root;
-
-  if (st & AVL_LESS)
-    target_cmp = 1;
-  else if (st & AVL_GREATER)
-    target_cmp = -1;
-  else
-    target_cmp = 0;
-
-  while (h != AVL_NULL) {
-    cmp = AVL_COMPARE_KEY_NODE(k, h);
-
-    if (cmp == 0) {
-      if (st & AVL_EQUAL) {
-        match_h = h;
-        break;
-      }
-
-      cmp = -target_cmp;
-    } else if (target_cmp != 0)
-      if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
-        /* cmp and target_cmp are both positive or both negative. */
-        match_h = h;
-
-    h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
-    L_CHECK_READ_ERROR(AVL_NULL)
-  }
-
-  return(match_h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_LEAST)
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *l_tree) {
-  AVL_HANDLE h = l_tree->root;
-  AVL_HANDLE parent = AVL_NULL;
-
-  while (h != AVL_NULL) {
-    parent = h;
-    h = AVL_GET_LESS(h, 1);
-    L_CHECK_READ_ERROR(AVL_NULL)
-  }
-
-  return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_GREATEST)
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *l_tree) {
-  AVL_HANDLE h = l_tree->root;
-  AVL_HANDLE parent = AVL_NULL;
-
-  while (h != AVL_NULL) {
-    parent = h;
-    h = AVL_GET_GREATER(h, 1);
-    L_CHECK_READ_ERROR(AVL_NULL)
-  }
-
-  return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_REMOVE)
-
-/* Prototype of balance function (called by remove) in case not in
-** same compilation unit.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *l_tree, AVL_KEY k) {
-  /* Zero-based depth in tree. */
-  unsigned depth = 0, rm_depth;
-
-  /* Records a path into the tree.  If bit n is true, indicates
-  ** take greater branch from the nth node in the path, otherwise
-  ** take the less branch.  bit 0 gives branch from root, and
-  ** so on. */
-  L_BIT_ARR_DEFN(branch)
-
-  AVL_HANDLE h = l_tree->root;
-  AVL_HANDLE parent = AVL_NULL;
-  AVL_HANDLE child;
-  AVL_HANDLE path;
-  int cmp, cmp_shortened_sub_with_path;
-  int reduced_depth;
-  int bf;
-  AVL_HANDLE rm;
-  AVL_HANDLE parent_rm;
-
-  for (;;) {
-    if (h == AVL_NULL)
-      /* No node in tree with given key. */
-      return(AVL_NULL);
-
-    cmp = AVL_COMPARE_KEY_NODE(k, h);
-
-    if (cmp == 0)
-      /* Found node to remove. */
-      break;
-
-    parent = h;
-
-    if (cmp > 0) {
-      h = AVL_GET_GREATER(h, 1);
-      L_BIT_ARR_1(branch, depth)
-    } else {
-      h = AVL_GET_LESS(h, 1);
-      L_BIT_ARR_0(branch, depth)
-    }
-
-    L_CHECK_READ_ERROR(AVL_NULL)
-    depth++;
-    cmp_shortened_sub_with_path = cmp;
-  }
-
-  rm = h;
-  parent_rm = parent;
-  rm_depth = depth;
-
-  /* If the node to remove is not a leaf node, we need to get a
-  ** leaf node, or a node with a single leaf as its child, to put
-  ** in the place of the node to remove.  We will get the greatest
-  ** node in the less subtree (of the node to remove), or the least
-  ** node in the greater subtree.  We take the leaf node from the
-  ** deeper subtree, if there is one. */
-
-  if (AVL_GET_BALANCE_FACTOR(h) < 0) {
-    child = AVL_GET_LESS(h, 1);
-    L_BIT_ARR_0(branch, depth)
-    cmp = -1;
-  } else {
-    child = AVL_GET_GREATER(h, 1);
-    L_BIT_ARR_1(branch, depth)
-    cmp = 1;
-  }
-
-  L_CHECK_READ_ERROR(AVL_NULL)
-  depth++;
-
-  if (child != AVL_NULL) {
-    cmp = -cmp;
-
-    do {
-      parent = h;
-      h = child;
-
-      if (cmp < 0) {
-        child = AVL_GET_LESS(h, 1);
-        L_BIT_ARR_0(branch, depth)
-      } else {
-        child = AVL_GET_GREATER(h, 1);
-        L_BIT_ARR_1(branch, depth)
-      }
-
-      L_CHECK_READ_ERROR(AVL_NULL)
-      depth++;
-    } while (child != AVL_NULL);
-
-    if (parent == rm)
-      /* Only went through do loop once.  Deleted node will be replaced
-      ** in the tree structure by one of its immediate children. */
-      cmp_shortened_sub_with_path = -cmp;
-    else
-      cmp_shortened_sub_with_path = cmp;
-
-    /* Get the handle of the opposite child, which may not be null. */
-    child = cmp > 0 ? AVL_GET_LESS(h, 0) : AVL_GET_GREATER(h, 0);
-  }
-
-  if (parent == AVL_NULL)
-    /* There were only 1 or 2 nodes in this tree. */
-    l_tree->root = child;
-  else if (cmp_shortened_sub_with_path < 0)
-    AVL_SET_LESS(parent, child)
-    else
-      AVL_SET_GREATER(parent, child)
-
-      /* "path" is the parent of the subtree being eliminated or reduced
-      ** from a depth of 2 to 1.  If "path" is the node to be removed, we
-      ** set path to the node we're about to poke into the position of the
-      ** node to be removed. */
-      path = parent == rm ? h : parent;
-
-  if (h != rm) {
-    /* Poke in the replacement for the node to be removed. */
-    AVL_SET_LESS(h, AVL_GET_LESS(rm, 0))
-    AVL_SET_GREATER(h, AVL_GET_GREATER(rm, 0))
-    AVL_SET_BALANCE_FACTOR(h, AVL_GET_BALANCE_FACTOR(rm))
-
-    if (parent_rm == AVL_NULL)
-      l_tree->root = h;
-    else {
-      depth = rm_depth - 1;
-
-      if (L_BIT_ARR_VAL(branch, depth))
-        AVL_SET_GREATER(parent_rm, h)
-        else
-          AVL_SET_LESS(parent_rm, h)
-        }
-  }
-
-  if (path != AVL_NULL) {
-    /* Create a temporary linked list from the parent of the path node
-    ** to the root node. */
-    h = l_tree->root;
-    parent = AVL_NULL;
-    depth = 0;
-
-    while (h != path) {
-      if (L_BIT_ARR_VAL(branch, depth)) {
-        child = AVL_GET_GREATER(h, 1);
-        AVL_SET_GREATER(h, parent)
-      } else {
-        child = AVL_GET_LESS(h, 1);
-        AVL_SET_LESS(h, parent)
-      }
-
-      L_CHECK_READ_ERROR(AVL_NULL)
-      depth++;
-      parent = h;
-      h = child;
-    }
-
-    /* Climb from the path node to the root node using the linked
-    ** list, restoring the tree structure and rebalancing as necessary.
-    */
-    reduced_depth = 1;
-    cmp = cmp_shortened_sub_with_path;
-
-    for (;;) {
-      if (reduced_depth) {
-        bf = AVL_GET_BALANCE_FACTOR(h);
-
-        if (cmp < 0)
-          bf++;
-        else  /* cmp > 0 */
-          bf--;
-
-        if ((bf == -2) || (bf == 2)) {
-          h = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX h);
-          L_CHECK_READ_ERROR(AVL_NULL)
-          bf = AVL_GET_BALANCE_FACTOR(h);
-        } else
-          AVL_SET_BALANCE_FACTOR(h, bf)
-          reduced_depth = (bf == 0);
-      }
-
-      if (parent == AVL_NULL)
-        break;
-
-      child = h;
-      h = parent;
-      depth--;
-      cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
-      if (cmp < 0) {
-        parent = AVL_GET_LESS(h, 1);
-        AVL_SET_LESS(h, child)
-      } else {
-        parent = AVL_GET_GREATER(h, 1);
-        AVL_SET_GREATER(h, child)
-      }
-
-      L_CHECK_READ_ERROR(AVL_NULL)
-    }
-
-    l_tree->root = h;
-  }
-
-  return(rm);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SUBST)
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *l_tree, AVL_HANDLE new_node) {
-  AVL_HANDLE h = l_tree->root;
-  AVL_HANDLE parent = AVL_NULL;
-  int cmp, last_cmp;
-
-  /* Search for node already in tree with same key. */
-  for (;;) {
-    if (h == AVL_NULL)
-      /* No node in tree with same key as new node. */
-      return(AVL_NULL);
-
-    cmp = AVL_COMPARE_NODE_NODE(new_node, h);
-
-    if (cmp == 0)
-      /* Found the node to substitute new one for. */
-      break;
-
-    last_cmp = cmp;
-    parent = h;
-    h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
-    L_CHECK_READ_ERROR(AVL_NULL)
-  }
-
-  /* Copy tree housekeeping fields from node in tree to new node. */
-  AVL_SET_LESS(new_node, AVL_GET_LESS(h, 0))
-  AVL_SET_GREATER(new_node, AVL_GET_GREATER(h, 0))
-  AVL_SET_BALANCE_FACTOR(new_node, AVL_GET_BALANCE_FACTOR(h))
-
-  if (parent == AVL_NULL)
-    /* New node is also new root. */
-    l_tree->root = new_node;
-  else {
-    /* Make parent point to new node. */
-    if (last_cmp < 0)
-      AVL_SET_LESS(parent, new_node)
-      else
-        AVL_SET_GREATER(parent, new_node)
-      }
-
-  return(h);
-}
-
-#endif
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-#if (L_IMPL_MASK & AVL_IMPL_BUILD)
-
-L_SC int L_(build)(
-  L_(avl) *l_tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes) {
-  /* Gives path to subtree being built.  If bit n is false, branch
-  ** less from the node at depth n, if true branch greater. */
-  L_BIT_ARR_DEFN(branch)
-
-  /* If bit n is true, then for the current subtree at depth n, its
-  ** greater subtree has one more node than its less subtree. */
-  L_BIT_ARR_DEFN(rem)
-
-  /* Depth of root node of current subtree. */
-  unsigned depth = 0;
-
-  /* Number of nodes in current subtree. */
-  L_SIZE num_sub = num_nodes;
-
-  /* The algorithm relies on a stack of nodes whose less subtree has
-  ** been built, but whose greater subtree has not yet been built.
-  ** The stack is implemented as linked list.  The nodes are linked
-  ** together by having the "greater" handle of a node set to the
-  ** next node in the list.  "less_parent" is the handle of the first
-  ** node in the list. */
-  AVL_HANDLE less_parent = AVL_NULL;
-
-  /* h is root of current subtree, child is one of its children. */
-  AVL_HANDLE h;
-  AVL_HANDLE child;
-
-  if (num_nodes == 0) {
-    l_tree->root = AVL_NULL;
-    return(1);
-  }
-
-  for (;;) {
-    while (num_sub > 2) {
-      /* Subtract one for root of subtree. */
-      num_sub--;
-
-      if (num_sub & 1)
-        L_BIT_ARR_1(rem, depth)
-        else
-          L_BIT_ARR_0(rem, depth)
-          L_BIT_ARR_0(branch, depth)
-          depth++;
-
-      num_sub >>= 1;
-    }
-
-    if (num_sub == 2) {
-      /* Build a subtree with two nodes, slanting to greater.
-      ** I arbitrarily chose to always have the extra node in the
-      ** greater subtree when there is an odd number of nodes to
-      ** split between the two subtrees. */
-
-      h = AVL_BUILD_ITER_VAL(p);
-      L_CHECK_READ_ERROR(0)
-      AVL_BUILD_ITER_INCR(p)
-      child = AVL_BUILD_ITER_VAL(p);
-      L_CHECK_READ_ERROR(0)
-      AVL_BUILD_ITER_INCR(p)
-      AVL_SET_LESS(child, AVL_NULL)
-      AVL_SET_GREATER(child, AVL_NULL)
-      AVL_SET_BALANCE_FACTOR(child, 0)
-      AVL_SET_GREATER(h, child)
-      AVL_SET_LESS(h, AVL_NULL)
-      AVL_SET_BALANCE_FACTOR(h, 1)
-    } else { /* num_sub == 1 */
-      /* Build a subtree with one node. */
-
-      h = AVL_BUILD_ITER_VAL(p);
-      L_CHECK_READ_ERROR(0)
-      AVL_BUILD_ITER_INCR(p)
-      AVL_SET_LESS(h, AVL_NULL)
-      AVL_SET_GREATER(h, AVL_NULL)
-      AVL_SET_BALANCE_FACTOR(h, 0)
-    }
-
-    while (depth) {
-      depth--;
-
-      if (!L_BIT_ARR_VAL(branch, depth))
-        /* We've completed a less subtree. */
-        break;
-
-      /* We've completed a greater subtree, so attach it to
-      ** its parent (that is less than it).  We pop the parent
-      ** off the stack of less parents. */
-      child = h;
-      h = less_parent;
-      less_parent = AVL_GET_GREATER(h, 1);
-      L_CHECK_READ_ERROR(0)
-      AVL_SET_GREATER(h, child)
-      /* num_sub = 2 * (num_sub - rem[depth]) + rem[depth] + 1 */
-      num_sub <<= 1;
-      num_sub += L_BIT_ARR_VAL(rem, depth) ? 0 : 1;
-
-      if (num_sub & (num_sub - 1))
-        /* num_sub is not a power of 2. */
-        AVL_SET_BALANCE_FACTOR(h, 0)
-        else
-          /* num_sub is a power of 2. */
-          AVL_SET_BALANCE_FACTOR(h, 1)
-        }
-
-    if (num_sub == num_nodes)
-      /* We've completed the full tree. */
-      break;
-
-    /* The subtree we've completed is the less subtree of the
-    ** next node in the sequence. */
-
-    child = h;
-    h = AVL_BUILD_ITER_VAL(p);
-    L_CHECK_READ_ERROR(0)
-    AVL_BUILD_ITER_INCR(p)
-    AVL_SET_LESS(h, child)
-
-    /* Put h into stack of less parents. */
-    AVL_SET_GREATER(h, less_parent)
-    less_parent = h;
-
-    /* Proceed to creating greater than subtree of h. */
-    L_BIT_ARR_1(branch, depth)
-    num_sub += L_BIT_ARR_VAL(rem, depth) ? 1 : 0;
-    depth++;
-
-  } /* end for (;; ) */
-
-  l_tree->root = h;
-
-  return(1);
-}
-
-#endif
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT_ITER)
-
-/* Initialize depth to invalid value, to indicate iterator is
-** invalid.  (Depth is zero-based.)  It's not necessary to initialize
-** iterators prior to passing them to the "start" function.
-*/
-L_SC void L_(init_iter)(L_(iter) *iter) {
-  iter->depth = ~0;
-}
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR_INV_DEPTH \
-  { if (AVL_READ_ERROR) { iter->depth = ~0; return; } }
-
-#else
-
-#define L_CHECK_READ_ERROR_INV_DEPTH
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER)
-
-L_SC void L_(start_iter)(
-  L_(avl) *l_tree, L_(iter) *iter, AVL_KEY k, avl_search_type st) {
-  AVL_HANDLE h = l_tree->root;
-  unsigned d = 0;
-  int cmp, target_cmp;
-
-  /* Save the tree that we're going to iterate through in a
-  ** member variable. */
-  iter->tree_ = l_tree;
-
-  iter->depth = ~0;
-
-  if (h == AVL_NULL)
-    /* Tree is empty. */
-    return;
-
-  if (st & AVL_LESS)
-    /* Key can be greater than key of starting node. */
-    target_cmp = 1;
-  else if (st & AVL_GREATER)
-    /* Key can be less than key of starting node. */
-    target_cmp = -1;
-  else
-    /* Key must be same as key of starting node. */
-    target_cmp = 0;
-
-  for (;;) {
-    cmp = AVL_COMPARE_KEY_NODE(k, h);
-
-    if (cmp == 0) {
-      if (st & AVL_EQUAL) {
-        /* Equal node was sought and found as starting node. */
-        iter->depth = d;
-        break;
-      }
-
-      cmp = -target_cmp;
-    } else if (target_cmp != 0)
-      if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
-        /* cmp and target_cmp are both negative or both positive. */
-        iter->depth = d;
-
-    h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
-    L_CHECK_READ_ERROR_INV_DEPTH
-
-    if (h == AVL_NULL)
-      break;
-
-    if (cmp > 0)
-      L_BIT_ARR_1(iter->branch, d)
-      else
-        L_BIT_ARR_0(iter->branch, d)
-        iter->path_h[d++] = h;
-  }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_LEAST)
-
-L_SC void L_(start_iter_least)(L_(avl) *l_tree, L_(iter) *iter) {
-  AVL_HANDLE h = l_tree->root;
-
-  iter->tree_ = l_tree;
-
-  iter->depth = ~0;
-
-  L_BIT_ARR_ALL(iter->branch, 0)
-
-  while (h != AVL_NULL) {
-    if (iter->depth != ~0)
-      iter->path_h[iter->depth] = h;
-
-    iter->depth++;
-    h = AVL_GET_LESS(h, 1);
-    L_CHECK_READ_ERROR_INV_DEPTH
-  }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_GREATEST)
-
-L_SC void L_(start_iter_greatest)(L_(avl) *l_tree, L_(iter) *iter) {
-  AVL_HANDLE h = l_tree->root;
-
-  iter->tree_ = l_tree;
-
-  iter->depth = ~0;
-
-  L_BIT_ARR_ALL(iter->branch, 1)
-
-  while (h != AVL_NULL) {
-    if (iter->depth != ~0)
-      iter->path_h[iter->depth] = h;
-
-    iter->depth++;
-    h = AVL_GET_GREATER(h, 1);
-    L_CHECK_READ_ERROR_INV_DEPTH
-  }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_GET_ITER)
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter) {
-  if (iter->depth == ~0)
-    return(AVL_NULL);
-
-  return(iter->depth == 0 ?
-         iter->tree_->root : iter->path_h[iter->depth - 1]);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INCR_ITER)
-
-L_SC void L_(incr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
-  if (iter->depth != ~0) {
-    AVL_HANDLE h =
-      AVL_GET_GREATER((iter->depth == 0 ?
-                       iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
-    L_CHECK_READ_ERROR_INV_DEPTH
-
-    if (h == AVL_NULL)
-      do {
-        if (iter->depth == 0) {
-          iter->depth = ~0;
-          break;
-        }
-
-        iter->depth--;
-      } while (L_BIT_ARR_VAL(iter->branch, iter->depth));
-    else {
-      L_BIT_ARR_1(iter->branch, iter->depth)
-      iter->path_h[iter->depth++] = h;
-
-      for (;;) {
-        h = AVL_GET_LESS(h, 1);
-        L_CHECK_READ_ERROR_INV_DEPTH
-
-        if (h == AVL_NULL)
-          break;
-
-        L_BIT_ARR_0(iter->branch, iter->depth)
-        iter->path_h[iter->depth++] = h;
-      }
-    }
-  }
-
-#undef l_tree
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_DECR_ITER)
-
-L_SC void L_(decr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
-  if (iter->depth != ~0) {
-    AVL_HANDLE h =
-      AVL_GET_LESS((iter->depth == 0 ?
-                    iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
-    L_CHECK_READ_ERROR_INV_DEPTH
-
-    if (h == AVL_NULL)
-      do {
-        if (iter->depth == 0) {
-          iter->depth = ~0;
-          break;
-        }
-
-        iter->depth--;
-      } while (!L_BIT_ARR_VAL(iter->branch, iter->depth));
-    else {
-      L_BIT_ARR_0(iter->branch, iter->depth)
-      iter->path_h[iter->depth++] = h;
-
-      for (;;) {
-        h = AVL_GET_GREATER(h, 1);
-        L_CHECK_READ_ERROR_INV_DEPTH
-
-        if (h == AVL_NULL)
-          break;
-
-        L_BIT_ARR_1(iter->branch, iter->depth)
-        iter->path_h[iter->depth++] = h;
-      }
-    }
-  }
-
-#undef l_tree
-}
-
-#endif
-
-/* Tidy up the preprocessor symbol name space. */
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_CALL_PREFIX
-#undef L_BALANCE_PARAM_DECL_PREFIX
-
-#endif  // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
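
Since both generic headers are being deleted, it is worth recording how they were meant to be used: a client defines the node handle, key, comparison, and accessor macros, then includes cavl_if.h followed by cavl_impl.h to stamp out a concrete tree. Below is a hedged sketch of such an instantiation; the node struct and macro bodies are illustrative, not the ones libvpx's heap manager actually used, and it assumes the two headers are still on the include path.

  /* Illustrative node type for the instantiation; not from libvpx. */
  typedef struct example_node {
    struct example_node *less, *greater;
    int balance;
    int key;
  } example_node;

  #define AVL_UNIQUE(BASE) example_ ## BASE
  #define AVL_HANDLE    example_node *
  #define AVL_KEY       int
  #define AVL_NULL      ((example_node *) 0)
  #define AVL_MAX_DEPTH 32

  /* Accessors; the second argument of the GET macros (a "read may fail" flag)
  ** is unused in this in-memory instantiation. */
  #define AVL_GET_LESS(H, ACCESS)        ((H)->less)
  #define AVL_GET_GREATER(H, ACCESS)     ((H)->greater)
  #define AVL_SET_LESS(H, V)             { (H)->less = (V); }
  #define AVL_SET_GREATER(H, V)          { (H)->greater = (V); }
  #define AVL_GET_BALANCE_FACTOR(H)      ((H)->balance)
  #define AVL_SET_BALANCE_FACTOR(H, BF)  { (H)->balance = (BF); }
  #define AVL_COMPARE_NODE_NODE(A, B)    ((A)->key - (B)->key)
  #define AVL_COMPARE_KEY_NODE(K, H)     ((K) - (H)->key)

  #include "cavl_if.h"    /* generates the example_avl type and prototypes */
  #include "cavl_impl.h"  /* generates example_init, example_insert, example_search, ... */

hmm_intrnl.h (further below in this change) performs the real instantiation for the heap manager, keying the tree on block size in BAUs.
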
diff --git a/libvpx/vpx_mem/memory_manager/include/heapmm.h b/libvpx/vpx_mem/memory_manager/include/heapmm.h
deleted file mode 100644
index d584b19..0000000
--- a/libvpx/vpx_mem/memory_manager/include/heapmm.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-/* External header file for Heap Memory Manager.  See documentation in
-** heapmm.html.
-*/
-
-#undef HMM_PROCESS
-
-/* Include once per configuration in a particular translation unit. */
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-#ifndef HMM_INC_CNFG_DFLT
-#define HMM_INC_CNFG_DFLT
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 0
-
-/* Test configuration. */
-
-#ifndef HMM_INC_CNFG_0
-#define HMM_INC_CNFG_0
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 1
-
-#ifndef HMM_INC_CNFG_1
-#define HMM_INC_CNFG_1
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 2
-
-#ifndef HMM_INC_CNFG_2
-#define HMM_INC_CNFG_2
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 3
-
-#ifndef HMM_INC_CNFG_3
-#define HMM_INC_CNFG_3
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 4
-
-#ifndef HMM_INC_CNFG_4
-#define HMM_INC_CNFG_4
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 5
-
-#ifndef HMM_INC_CNFG_5
-#define HMM_INC_CNFG_5
-#define HMM_PROCESS
-#endif
-
-#endif
-
-#ifdef HMM_PROCESS
-
-#include "hmm_cnfg.h"
-
-/* Heap descriptor. */
-typedef struct HMM_UNIQUE(structure) {
-  /* private: */
-
-  /* Pointer to (payload of) root node in AVL tree.  This field should
-  ** really be the AVL tree descriptor (type avl_avl).  But (in the
-  ** instantiation of the AVL tree generic package used in this package) the
-  ** AVL tree descriptor simply contains a pointer to the root.  So,
-  ** whenever a pointer to the AVL tree descriptor is needed, I use the
-  ** cast:
-  **
-  ** (avl_avl *) &(heap_desc->avl_tree_root)
-  **
-  ** (where heap_desc is a pointer to a heap descriptor).  This trick
-  ** allows me to avoid including cavl_if.h in this external header. */
-  void *avl_tree_root;
-
-  /* Pointer to first byte of last block freed, after any coalescing. */
-  void *last_freed;
-
-  /* public: */
-
-  HMM_UNIQUE(size_bau) num_baus_can_shrink;
-  void *end_of_shrinkable_chunk;
-}
-HMM_UNIQUE(descriptor);
-
-/* Prototypes for externally-callable functions. */
-
-void HMM_UNIQUE(init)(HMM_UNIQUE(descriptor) *desc);
-
-void *HMM_UNIQUE(alloc)(
-  HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-void *HMM_UNIQUE(greedy_alloc)(
-  HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) needed_addr_align_units,
-  HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-int HMM_UNIQUE(resize)(
-  HMM_UNIQUE(descriptor) *desc, void *mem,
-  HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-int HMM_UNIQUE(greedy_resize)(
-  HMM_UNIQUE(descriptor) *desc, void *mem,
-  HMM_UNIQUE(size_aau) needed_addr_align_units,
-  HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-void HMM_UNIQUE(free)(HMM_UNIQUE(descriptor) *desc, void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(true_size)(void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(largest_available)(
-  HMM_UNIQUE(descriptor) *desc);
-
-void HMM_UNIQUE(new_chunk)(
-  HMM_UNIQUE(descriptor) *desc, void *start_of_chunk,
-  HMM_UNIQUE(size_bau) num_block_align_units);
-
-void HMM_UNIQUE(grow_chunk)(
-  HMM_UNIQUE(descriptor) *desc, void *end_of_chunk,
-  HMM_UNIQUE(size_bau) num_block_align_units);
-
-/* NOT YET IMPLEMENTED */
-void HMM_UNIQUE(shrink_chunk)(
-  HMM_UNIQUE(descriptor) *desc,
-  HMM_UNIQUE(size_bau) num_block_align_units);
-
-#endif /* defined HMM_PROCESS */
-#endif  // VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
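
As a record of how this external interface was driven: a caller initializes a descriptor, donates a chunk of memory, and then allocates and frees in Address Alignment Units (32 bytes each under the default configuration). A minimal usage sketch follows, assuming the deleted headers were still on the include path; chunk alignment and error handling are glossed over.

  #include <stdio.h>
  #include <stdlib.h>
  #include "heapmm.h"   /* default configuration: hmm_ prefix, 32-byte AAU, 1 AAU per BAU */

  int main(void) {
    hmm_descriptor desc;
    size_t chunk_bytes = 1 << 20;          /* donate 1 MiB to the heap manager */
    void *chunk = malloc(chunk_bytes);
    void *p;

    hmm_init(&desc);
    hmm_new_chunk(&desc, chunk, chunk_bytes >> 5);   /* bytes -> BAUs (32 bytes each) */

    p = hmm_alloc(&desc, 4);               /* request 4 AAUs (128 bytes) of payload */
    if (p) {
      printf("block really holds %lu AAUs\n", (unsigned long) hmm_true_size(p));
      hmm_free(&desc, p);
    }

    free(chunk);
    return 0;
  }
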
diff --git a/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h b/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
deleted file mode 100644
index caa8713..0000000
--- a/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-/* Configure Heap Memory Manager for processor architecture, compiler,
-** and desired performance characteristics.  This file is included
-** by heapmm.h, so these definitions can be used by code external to
-** HMM.  You can change the default configuration, and/or create alternate
-** configuration(s).
-*/
-
-/* To allow for multiple configurations of HMM to be used in the same
-** compilation unit, undefine all preprocessor symbols that will be
-** defined below.
-*/
-#undef HMM_ADDR_ALIGN_UNIT
-#undef HMM_BLOCK_ALIGN_UNIT
-#undef HMM_UNIQUE
-#undef HMM_DESC_PARAM
-#undef HMM_SYM_TO_STRING
-#undef HMM_SYM_TO_STRING
-#undef HMM_AUDIT_FAIL
-
-/* Turn X into a string after one macro expansion pass of X.  This trick
-** works with both GCC and Visual C++. */
-#define HMM_SYM_TO_STRING(X) HMM_SYM_TO_STRING(X)
-#define HMM_SYM_TO_STRING(X) #X
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-/* Use hmm_ prefix to avoid identifier conflicts. */
-#define HMM_UNIQUE(BASE) hmm_ ## BASE
-
-/* Number of bytes in an Address Alignment Unit (AAU). */
-// fwg
-// #define HMM_ADDR_ALIGN_UNIT sizeof(int)
-#define HMM_ADDR_ALIGN_UNIT 32
-
-/* Number of AAUs in a Block Alignment Unit (BAU). */
-#define HMM_BLOCK_ALIGN_UNIT 1
-
-/* Type of unsigned integer big enough to hold the size of a Block in AAUs. */
-typedef unsigned long HMM_UNIQUE(size_aau);
-
-/* Type of unsigned integer big enough to hold the size of a Block/Chunk
-** in BAUs.  The high bit will be robbed. */
-typedef unsigned long HMM_UNIQUE(size_bau);
-
-void hmm_dflt_abort(const char *, const char *);
-
-/* Actions upon a self-audit failure.  Must expand to a single complete
-** statement.  If you remove the definition of this macro, no self-auditing
-** will be performed. */
-#define HMM_AUDIT_FAIL \
-  hmm_dflt_abort(__FILE__, HMM_SYM_TO_STRING(__LINE__));
-
-#elif HMM_CNFG_NUM == 0
-
-/* Definitions for testing. */
-
-#define HMM_UNIQUE(BASE) thmm_ ## BASE
-
-#define HMM_ADDR_ALIGN_UNIT sizeof(int)
-
-#define HMM_BLOCK_ALIGN_UNIT 3
-
-typedef unsigned HMM_UNIQUE(size_aau);
-
-typedef unsigned short HMM_UNIQUE(size_bau);
-
-/* Under this test setup, a long jump is done if there is a self-audit
-** failure.
-*/
-
-extern jmp_buf HMM_UNIQUE(jmp_buf);
-extern const char *HMM_UNIQUE(fail_file);
-extern unsigned HMM_UNIQUE(fail_line);
-
-#define HMM_AUDIT_FAIL \
-  { HMM_UNIQUE(fail_file) = __FILE__; HMM_UNIQUE(fail_line) = __LINE__; \
-    longjmp(HMM_UNIQUE(jmp_buf), 1); }
-
-#elif HMM_CNFG_NUM == 1
-
-/* Put configuration 1 definitions here (if there is a configuration 1). */
-
-#elif HMM_CNFG_NUM == 2
-
-/* Put configuration 2 definitions here. */
-
-#elif HMM_CNFG_NUM == 3
-
-/* Put configuration 3 definitions here. */
-
-#elif HMM_CNFG_NUM == 4
-
-/* Put configuration 4 definitions here. */
-
-#elif HMM_CNFG_NUM == 5
-
-/* Put configuration 5 definitions here. */
-
-#endif
-
-#endif  // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
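
One note on the stringification helper above: HMM_SYM_TO_STRING is defined twice, so compilers warn about the redefinition and keep the second body, which defeats the extra expansion pass the comment describes; __LINE__ then reaches the # operator unexpanded. The conventional two-macro idiom looks like this (helper name is illustrative):

  #include <stdio.h>

  /* Two-level stringification: the outer macro expands its argument once,
  ** the helper then turns the result into a string literal. */
  #define SYM_TO_STRING_HELPER(X) #X
  #define SYM_TO_STRING(X) SYM_TO_STRING_HELPER(X)

  int main(void) {
    /* Prints the current line number as text, not the word "__LINE__". */
    printf("line %s\n", SYM_TO_STRING(__LINE__));
    return 0;
  }
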
diff --git a/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h b/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
deleted file mode 100644
index 7302aa2..0000000
--- a/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1  Author: Walt Karas
-*/
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-
-#ifdef __uClinux__
-# include <lddk.h>
-#endif
-
-#include "heapmm.h"
-
-#define U(BASE) HMM_UNIQUE(BASE)
-
-/* Mask of high bit of variable of size_bau type. */
-#define HIGH_BIT_BAU_SIZE \
-  ((U(size_bau)) ~ (((U(size_bau)) ~ (U(size_bau)) 0) >> 1))
-
-/* Add a given number of AAUs to pointer. */
-#define AAUS_FORWARD(PTR, AAU_OFFSET) \
-  (((char *) (PTR)) + ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Subtract a given number of AAUs from pointer. */
-#define AAUS_BACKWARD(PTR, AAU_OFFSET) \
-  (((char *) (PTR)) - ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Add a given number of BAUs to a pointer. */
-#define BAUS_FORWARD(PTR, BAU_OFFSET) \
-  AAUS_FORWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-/* Subtract a given number of BAUs from a pointer. */
-#define BAUS_BACKWARD(PTR, BAU_OFFSET) \
-  AAUS_BACKWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-typedef struct head_struct {
-  /* Sizes in Block Alignment Units. */
-  HMM_UNIQUE(size_bau) previous_block_size, block_size;
-}
-head_record;
-
-typedef struct ptr_struct {
-  struct ptr_struct *self, *prev, *next;
-}
-ptr_record;
-
-/* Divide and round up any fraction to the next whole number. */
-#define DIV_ROUND_UP(NUMER, DENOM) (((NUMER) + (DENOM) - 1) / (DENOM))
-
-/* Number of AAUs in a block head. */
-#define HEAD_AAUS DIV_ROUND_UP(sizeof(head_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of AAUs in a block pointer record. */
-#define PTR_RECORD_AAUS DIV_ROUND_UP(sizeof(ptr_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of BAUs in a dummy end record (at end of chunk). */
-#define DUMMY_END_BLOCK_BAUS DIV_ROUND_UP(HEAD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Minimum number of BAUs in a block (allowing room for the pointer record). */
-#define MIN_BLOCK_BAUS \
-  DIV_ROUND_UP(HEAD_AAUS + PTR_RECORD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Return number of BAUs in block (masking off high bit containing block
-** status). */
-#define BLOCK_BAUS(HEAD_PTR) \
-  (((head_record *) (HEAD_PTR))->block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Return number of BAUs in previous block (masking off high bit containing
-** block status). */
-#define PREV_BLOCK_BAUS(HEAD_PTR) \
-  (((head_record *) (HEAD_PTR))->previous_block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Set number of BAUs in previous block, preserving high bit containing
-** block status. */
-#define SET_PREV_BLOCK_BAUS(HEAD_PTR, N_BAUS) \
-  { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
-    h_ptr->previous_block_size &= HIGH_BIT_BAU_SIZE; \
-    h_ptr->previous_block_size |= (N_BAUS); }
-
-/* Convert pointer to pointer record of block to pointer to block's head
-** record. */
-#define PTR_REC_TO_HEAD(PTR_REC_PTR) \
-  ((head_record *) AAUS_BACKWARD(PTR_REC_PTR, HEAD_AAUS))
-
-/* Convert pointer to block head to pointer to block's pointer record. */
-#define HEAD_TO_PTR_REC(HEAD_PTR) \
-  ((ptr_record *) AAUS_FORWARD(HEAD_PTR, HEAD_AAUS))
-
-/* Returns non-zero if block is allocated. */
-#define IS_BLOCK_ALLOCATED(HEAD_PTR) \
-  (((((head_record *) (HEAD_PTR))->block_size | \
-     ((head_record *) (HEAD_PTR))->previous_block_size) & \
-    HIGH_BIT_BAU_SIZE) == 0)
-
-#define MARK_BLOCK_ALLOCATED(HEAD_PTR) \
-  { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
-    h_ptr->block_size &= ~HIGH_BIT_BAU_SIZE; \
-    h_ptr->previous_block_size &= ~HIGH_BIT_BAU_SIZE; }
-
-/* Mark a block as free when it is not the first block in a bin (and
-** therefore not a node in the AVL tree). */
-#define MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(HEAD_PTR) \
-  { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
-    h_ptr->block_size |= HIGH_BIT_BAU_SIZE; }
-
-/* Prototypes for internal functions implemented in one file and called in
-** another.
-*/
-
-void U(into_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void U(out_of_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void *U(alloc_from_bin)(
-  U(descriptor) *desc, ptr_record *bin_front_ptr, U(size_bau) n_baus);
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Simply contains a reference to the HMM_AUDIT_FAIL macro and a
-** dummy return. */
-int U(audit_block_fail_dummy_return)(void);
-
-
-/* Auditing a block consists of checking that the size in its head
-** matches the previous block size in the head of the next block. */
-#define AUDIT_BLOCK_AS_EXPR(HEAD_PTR) \
-  ((BLOCK_BAUS(HEAD_PTR) == \
-    PREV_BLOCK_BAUS(BAUS_FORWARD(HEAD_PTR, BLOCK_BAUS(HEAD_PTR)))) ? \
-   0 : U(audit_block_fail_dummy_return)())
-
-#define AUDIT_BLOCK(HEAD_PTR) \
-  { void *h_ptr = (HEAD_PTR); AUDIT_BLOCK_AS_EXPR(h_ptr); }
-
-#endif
-
-/* Interface to AVL tree generic package instantiation. */
-
-#define AVL_UNIQUE(BASE) U(avl_ ## BASE)
-
-#define AVL_HANDLE ptr_record *
-
-#define AVL_KEY U(size_bau)
-
-#define AVL_MAX_DEPTH 64
-
-#include "cavl_if.h"
-
-#endif  // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
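
The AUDIT_BLOCK_AS_EXPR check above encodes one invariant of the block chain: the size stored in a block's head must equal the previous_block_size recorded by the head that follows it. A toy sketch of the same consistency check over an array of heads; real heads sit at their block offsets inside the chunk and carry a status bit in the top of each size, both of which are omitted here.

  #include <stdio.h>

  typedef struct {
    unsigned long previous_block_size;  /* BAUs in the block just before this one */
    unsigned long block_size;           /* BAUs in this block (0 marks the dummy end) */
  } toy_head;

  /* Returns 1 when every adjacent pair of heads agrees on the block size. */
  static int audit_chain(const toy_head *heads, int count) {
    int i;
    for (i = 0; i + 1 < count; i++)
      if (heads[i].block_size != heads[i + 1].previous_block_size)
        return 0;
    return 1;
  }

  int main(void) {
    toy_head chain[3] = { {0, 4}, {4, 8}, {8, 0} };  /* last entry is the dummy end block */
    printf("audit: %s\n", audit_chain(chain, 3) ? "ok" : "corrupt");
    return 0;
  }
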
diff --git a/libvpx/vpx_mem/vpx_mem.c b/libvpx/vpx_mem/vpx_mem.c
index 059248b..c6f501a 100644
--- a/libvpx/vpx_mem/vpx_mem.c
+++ b/libvpx/vpx_mem/vpx_mem.c
@@ -16,114 +16,13 @@
 #include <stdlib.h>
 #include <string.h>
 #include "include/vpx_mem_intrnl.h"
-
-#if CONFIG_MEM_TRACKER
-#ifndef VPX_NO_GLOBALS
-static unsigned long g_alloc_count = 0;
-#else
-#include "vpx_global_handling.h"
-#define g_alloc_count vpxglobalm(vpxmem,g_alloc_count)
-#endif
-#endif
-
-#if CONFIG_MEM_MANAGER
-# include "heapmm.h"
-# include "hmm_intrnl.h"
-
-# define SHIFT_HMM_ADDR_ALIGN_UNIT 5
-# define TOTAL_MEMORY_TO_ALLOCATE  20971520 /* 20 * 1024 * 1024 */
-
-# define MM_DYNAMIC_MEMORY 1
-# if MM_DYNAMIC_MEMORY
-static unsigned char *g_p_mng_memory_raw = NULL;
-static unsigned char *g_p_mng_memory     = NULL;
-# else
-static unsigned char g_p_mng_memory[TOTAL_MEMORY_TO_ALLOCATE];
-# endif
-
-static size_t g_mm_memory_size = TOTAL_MEMORY_TO_ALLOCATE;
-
-static hmm_descriptor hmm_d;
-static int g_mng_memory_allocated = 0;
-
-static int vpx_mm_create_heap_memory();
-static void *vpx_mm_realloc(void *memblk, size_t size);
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-struct GLOBAL_FUNC_POINTERS {
-  g_malloc_func g_malloc;
-  g_calloc_func g_calloc;
-  g_realloc_func g_realloc;
-  g_free_func g_free;
-  g_memcpy_func g_memcpy;
-  g_memset_func g_memset;
-  g_memmove_func g_memmove;
-} *g_func = NULL;
-
-# define VPX_MALLOC_L  g_func->g_malloc
-# define VPX_REALLOC_L g_func->g_realloc
-# define VPX_FREE_L    g_func->g_free
-# define VPX_MEMCPY_L  g_func->g_memcpy
-# define VPX_MEMSET_L  g_func->g_memset
-# define VPX_MEMMOVE_L g_func->g_memmove
-#else
-# define VPX_MALLOC_L  malloc
-# define VPX_REALLOC_L realloc
-# define VPX_FREE_L    free
-# define VPX_MEMCPY_L  memcpy
-# define VPX_MEMSET_L  memset
-# define VPX_MEMMOVE_L memmove
-#endif /* USE_GLOBAL_FUNCTION_POINTERS */
-
-unsigned int vpx_mem_get_version() {
-  unsigned int ver = ((unsigned int)(unsigned char)VPX_MEM_VERSION_CHIEF << 24 |
-                      (unsigned int)(unsigned char)VPX_MEM_VERSION_MAJOR << 16 |
-                      (unsigned int)(unsigned char)VPX_MEM_VERSION_MINOR << 8  |
-                      (unsigned int)(unsigned char)VPX_MEM_VERSION_PATCH);
-  return ver;
-}
-
-int vpx_mem_set_heap_size(size_t size) {
-  int ret = -1;
-
-#if CONFIG_MEM_MANAGER
-#if MM_DYNAMIC_MEMORY
-
-  if (!g_mng_memory_allocated && size) {
-    g_mm_memory_size = size;
-    ret = 0;
-  } else
-    ret = -3;
-
-#else
-  ret = -2;
-#endif
-#else
-  (void)size;
-#endif
-
-  return ret;
-}
+#include "vpx/vpx_integer.h"
 
 void *vpx_memalign(size_t align, size_t size) {
   void *addr,
        * x = NULL;
 
-#if CONFIG_MEM_MANAGER
-  int number_aau;
-
-  if (vpx_mm_create_heap_memory() < 0) {
-    _P(printf("[vpx][mm] ERROR vpx_memalign() Couldn't create memory for Heap.\n");)
-  }
-
-  number_aau = ((size + align - 1 + ADDRESS_STORAGE_SIZE) >>
-                SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
-  addr = hmm_alloc(&hmm_d, number_aau);
-#else
-  addr = VPX_MALLOC_L(size + align - 1 + ADDRESS_STORAGE_SIZE);
-#endif /*CONFIG_MEM_MANAGER*/
+  addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
 
   if (addr) {
     x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
@@ -144,7 +43,7 @@
   x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
 
   if (x)
-    VPX_MEMSET_L(x, 0, num * size);
+    memset(x, 0, num * size);
 
   return x;
 }
@@ -170,11 +69,7 @@
     addr   = (void *)(((size_t *)memblk)[-1]);
     memblk = NULL;
 
-#if CONFIG_MEM_MANAGER
-    new_addr = vpx_mm_realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
-#else
-    new_addr = VPX_REALLOC_L(addr, size + align + ADDRESS_STORAGE_SIZE);
-#endif
+    new_addr = realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
 
     if (new_addr) {
       addr = new_addr;
@@ -192,466 +87,17 @@
 void vpx_free(void *memblk) {
   if (memblk) {
     void *addr = (void *)(((size_t *)memblk)[-1]);
-#if CONFIG_MEM_MANAGER
-    hmm_free(&hmm_d, addr);
-#else
-    VPX_FREE_L(addr);
-#endif
+    free(addr);
   }
 }
 
-#if CONFIG_MEM_TRACKER
-void *xvpx_memalign(size_t align, size_t size, char *file, int line) {
-#if TRY_BOUNDS_CHECK
-  unsigned char *x_bounds;
-#endif
-
-  void *x;
-
-  if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
-    int i_rv = vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE);
-#else
-    int i_rv = vpx_memory_tracker_init(0, 0);
-#endif
-
-    if (i_rv < 0) {
-      _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
-    }
-  }
-
-#if TRY_BOUNDS_CHECK
-  {
-    int i;
-    unsigned int tempme = BOUNDS_CHECK_VALUE;
-
-    x_bounds = vpx_memalign(align, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
-    if (x_bounds) {
-      /*we're aligning the address twice here but to keep things
-        consistent we want to have the padding come before the stored
-        address so no matter what free function gets called we will
-        attempt to free the correct address*/
-      x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
-      x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
-                     (int)align);
-      /* save the actual malloc address */
-      ((size_t *)x)[-1] = (size_t)x_bounds;
-
-      for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
-        VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
-        VPX_MEMCPY_L((unsigned char *)x + size + i,
-                     &tempme, sizeof(unsigned int));
-      }
-    } else
-      x = NULL;
-  }
-#else
-  x = vpx_memalign(align, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
-  g_alloc_count++;
-
-  vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
-
-  return x;
+#if CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+  int i;
+  void *orig = dest;
+  uint16_t *dest16 = dest;
+  for (i = 0; i < length; i++)
+    *dest16++ = val;
+  return orig;
 }
-
-void *xvpx_malloc(size_t size, char *file, int line) {
-  return xvpx_memalign(DEFAULT_ALIGNMENT, size, file, line);
-}
-
-void *xvpx_calloc(size_t num, size_t size, char *file, int line) {
-  void *x = xvpx_memalign(DEFAULT_ALIGNMENT, num * size, file, line);
-
-  if (x)
-    VPX_MEMSET_L(x, 0, num * size);
-
-  return x;
-}
-
-void *xvpx_realloc(void *memblk, size_t size, char *file, int line) {
-  struct mem_block *p = NULL;
-  int orig_size = 0,
-      orig_line = 0;
-  char *orig_file = NULL;
-
-#if TRY_BOUNDS_CHECK
-  unsigned char *x_bounds = memblk ?
-                            (unsigned char *)(((size_t *)memblk)[-1]) :
-                            NULL;
-#endif
-
-  void *x;
-
-  if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
-
-    if (!vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE))
-#else
-    if (!vpx_memory_tracker_init(0, 0))
-#endif
-    {
-      _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
-    }
-  }
-
-  if ((p = vpx_memory_tracker_find((size_t)memblk))) {
-    orig_size = p->size;
-    orig_file = p->file;
-    orig_line = p->line;
-  }
-
-#if TRY_BOUNDS_CHECK_ON_FREE
-  vpx_memory_tracker_check_integrity(file, line);
-#endif
-
-  /* have to do this regardless of success, because
-   * the memory that does get realloc'd may change
-   * the bounds values of this block
-   */
-  vpx_memory_tracker_remove((size_t)memblk);
-
-#if TRY_BOUNDS_CHECK
-  {
-    int i;
-    unsigned int tempme = BOUNDS_CHECK_VALUE;
-
-    x_bounds = vpx_realloc(memblk, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
-    if (x_bounds) {
-      x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
-      x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
-                     (int)DEFAULT_ALIGNMENT);
-      /* save the actual malloc address */
-      ((size_t *)x)[-1] = (size_t)x_bounds;
-
-      for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
-        VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
-        VPX_MEMCPY_L((unsigned char *)x + size + i,
-                     &tempme, sizeof(unsigned int));
-      }
-    } else
-      x = NULL;
-  }
-#else
-  x = vpx_realloc(memblk, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
-  if (!memblk) ++g_alloc_count;
-
-  if (x)
-    vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
-  else
-    vpx_memory_tracker_add((size_t)memblk, orig_size, orig_file, orig_line, 1);
-
-  return x;
-}
-
-void xvpx_free(void *p_address, char *file, int line) {
-#if TRY_BOUNDS_CHECK
-  unsigned char *p_bounds_address = (unsigned char *)p_address;
-  /*p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;*/
-#endif
-
-#if !TRY_BOUNDS_CHECK_ON_FREE
-  (void)file;
-  (void)line;
-#endif
-
-  if (p_address) {
-#if TRY_BOUNDS_CHECK_ON_FREE
-    vpx_memory_tracker_check_integrity(file, line);
-#endif
-
-    /* if the addr isn't found in the list, assume it was allocated via
-     * vpx_ calls not xvpx_, therefore it does not contain any padding
-     */
-    if (vpx_memory_tracker_remove((size_t)p_address) == -2) {
-      p_bounds_address = p_address;
-      _P(fprintf(stderr, "[vpx_mem][xvpx_free] addr: %p not found in"
-                 " list; freed from file:%s"
-                 " line:%d\n", p_address, file, line));
-    } else
-      --g_alloc_count;
-
-#if TRY_BOUNDS_CHECK
-    vpx_free(p_bounds_address);
-#else
-    vpx_free(p_address);
-#endif
-
-    if (!g_alloc_count)
-      vpx_memory_tracker_destroy();
-  }
-}
-
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if CONFIG_MEM_CHECKS
-#if defined(VXWORKS)
-#include <task_lib.h> /*for task_delay()*/
-/* This function is only used to get a stack trace of the player
-object so we can se where we are having a problem. */
-static int get_my_tt(int task) {
-  tt(task);
-
-  return 0;
-}
-
-static void vx_sleep(int msec) {
-  int ticks_to_sleep = 0;
-
-  if (msec) {
-    int msec_per_tick = 1000 / sys_clk_rate_get();
-
-    if (msec < msec_per_tick)
-      ticks_to_sleep++;
-    else
-      ticks_to_sleep = msec / msec_per_tick;
-  }
-
-  task_delay(ticks_to_sleep);
-}
-#endif
-#endif
-
-void *vpx_memcpy(void *dest, const void *source, size_t length) {
-#if CONFIG_MEM_CHECKS
-
-  if (((int)dest < 0x4000) || ((int)source < 0x4000)) {
-    _P(printf("WARNING: vpx_memcpy dest:0x%x source:0x%x len:%d\n", (int)dest, (int)source, length);)
-
-#if defined(VXWORKS)
-    sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-    vx_sleep(10000);
-#endif
-  }
-
-#endif
-
-  return VPX_MEMCPY_L(dest, source, length);
-}
-
-void *vpx_memset(void *dest, int val, size_t length) {
-#if CONFIG_MEM_CHECKS
-
-  if ((int)dest < 0x4000) {
-    _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", (int)dest, val, length);)
-
-#if defined(VXWORKS)
-    sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-    vx_sleep(10000);
-#endif
-  }
-
-#endif
-
-  return VPX_MEMSET_L(dest, val, length);
-}
-
-void *vpx_memmove(void *dest, const void *src, size_t count) {
-#if CONFIG_MEM_CHECKS
-
-  if (((int)dest < 0x4000) || ((int)src < 0x4000)) {
-    _P(printf("WARNING: vpx_memmove dest:0x%x src:0x%x count:%d\n", (int)dest, (int)src, count);)
-
-#if defined(VXWORKS)
-    sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-    vx_sleep(10000);
-#endif
-  }
-
-#endif
-
-  return VPX_MEMMOVE_L(dest, src, count);
-}
-
-#if CONFIG_MEM_MANAGER
-
-static int vpx_mm_create_heap_memory() {
-  int i_rv = 0;
-
-  if (!g_mng_memory_allocated) {
-#if MM_DYNAMIC_MEMORY
-    g_p_mng_memory_raw =
-      (unsigned char *)malloc(g_mm_memory_size + HMM_ADDR_ALIGN_UNIT);
-
-    if (g_p_mng_memory_raw) {
-      g_p_mng_memory = (unsigned char *)((((unsigned int)g_p_mng_memory_raw) +
-                                          HMM_ADDR_ALIGN_UNIT - 1) &
-                                         -(int)HMM_ADDR_ALIGN_UNIT);
-
-      _P(printf("[vpx][mm] total memory size:%d g_p_mng_memory_raw:0x%x g_p_mng_memory:0x%x\n"
-, g_mm_memory_size + HMM_ADDR_ALIGN_UNIT
-, (unsigned int)g_p_mng_memory_raw
-, (unsigned int)g_p_mng_memory);)
-    } else {
-      _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
-      i_rv = -1;
-    }
-
-    if (g_p_mng_memory)
-#endif
-    {
-      int chunk_size = 0;
-
-      g_mng_memory_allocated = 1;
-
-      hmm_init(&hmm_d);
-
-      chunk_size = g_mm_memory_size >> SHIFT_HMM_ADDR_ALIGN_UNIT;
-
-      chunk_size -= DUMMY_END_BLOCK_BAUS;
-
-      _P(printf("[vpx][mm] memory size:%d for vpx memory manager. g_p_mng_memory:0x%x  chunk_size:%d\n"
-, g_mm_memory_size
-, (unsigned int)g_p_mng_memory
-, chunk_size);)
-
-      hmm_new_chunk(&hmm_d, (void *)g_p_mng_memory, chunk_size);
-    }
-
-#if MM_DYNAMIC_MEMORY
-    else {
-      _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
-      i_rv = -1;
-    }
-
-#endif
-  }
-
-  return i_rv;
-}
-
-static void *vpx_mm_realloc(void *memblk, size_t size) {
-  void *p_ret = NULL;
-
-  if (vpx_mm_create_heap_memory() < 0) {
-    _P(printf("[vpx][mm] ERROR vpx_mm_realloc() Couldn't create memory for Heap.\n");)
-  } else {
-    int i_rv = 0;
-    int old_num_aaus;
-    int new_num_aaus;
-
-    old_num_aaus = hmm_true_size(memblk);
-    new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
-    if (old_num_aaus == new_num_aaus) {
-      p_ret = memblk;
-    } else {
-      i_rv = hmm_resize(&hmm_d, memblk, new_num_aaus);
-
-      if (i_rv == 0) {
-        p_ret = memblk;
-      } else {
-        /* Error. Try to malloc and then copy data. */
-        void *p_from_malloc;
-
-        new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-        p_from_malloc  = hmm_alloc(&hmm_d, new_num_aaus);
-
-        if (p_from_malloc) {
-          vpx_memcpy(p_from_malloc, memblk, size);
-          hmm_free(&hmm_d, memblk);
-
-          p_ret = p_from_malloc;
-        }
-      }
-    }
-  }
-
-  return p_ret;
-}
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-# if CONFIG_MEM_TRACKER
-extern int vpx_memory_tracker_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
-# endif
-#endif /*USE_GLOBAL_FUNCTION_POINTERS*/
-int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-  /* If use global functions is turned on then the
-  application must set the global functions before
-  it does anything else or vpx_mem will have
-  unpredictable results. */
-  if (!g_func) {
-    g_func = (struct GLOBAL_FUNC_POINTERS *)
-             g_malloc_l(sizeof(struct GLOBAL_FUNC_POINTERS));
-
-    if (!g_func) {
-      return -1;
-    }
-  }
-
-#if CONFIG_MEM_TRACKER
-  {
-    int rv = 0;
-    rv = vpx_memory_tracker_set_functions(g_malloc_l
-, g_calloc_l
-, g_realloc_l
-, g_free_l
-, g_memcpy_l
-, g_memset_l
-, g_memmove_l);
-
-    if (rv < 0) {
-      return rv;
-    }
-  }
-#endif
-
-  g_func->g_malloc  = g_malloc_l;
-  g_func->g_calloc  = g_calloc_l;
-  g_func->g_realloc = g_realloc_l;
-  g_func->g_free    = g_free_l;
-  g_func->g_memcpy  = g_memcpy_l;
-  g_func->g_memset  = g_memset_l;
-  g_func->g_memmove = g_memmove_l;
-
-  return 0;
-#else
-  (void)g_malloc_l;
-  (void)g_calloc_l;
-  (void)g_realloc_l;
-  (void)g_free_l;
-  (void)g_memcpy_l;
-  (void)g_memset_l;
-  (void)g_memmove_l;
-  return -1;
-#endif
-}
-
-int vpx_mem_unset_functions() {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-  if (g_func) {
-    g_free_func temp_free = g_func->g_free;
-    temp_free(g_func);
-    g_func = NULL;
-  }
-
-#endif
-  return 0;
-}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
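
The simplified vpx_memalign()/vpx_free() above keep the long-standing libvpx trick of over-allocating, aligning the returned pointer, and stashing the original malloc() address in the word just before the aligned block; that is why vpx_free() reads ((size_t *)memblk)[-1]. A minimal standalone sketch of that scheme, assuming align is a power of two; my_memalign/my_free are illustrative names, not libvpx APIs:

  #include <stdint.h>
  #include <stdlib.h>

  /* Sketch (not libvpx code): over-allocate, align the returned pointer, and
   * store the original malloc() address immediately before the aligned block. */
  static void *my_memalign(size_t align, size_t size) {
    /* room for the saved pointer plus worst-case alignment slack */
    unsigned char *raw = malloc(size + align - 1 + sizeof(size_t));
    uintptr_t aligned;
    if (!raw) return NULL;
    aligned = ((uintptr_t)raw + sizeof(size_t) + align - 1) & ~(uintptr_t)(align - 1);
    ((size_t *)aligned)[-1] = (size_t)raw;  /* same trick as ((size_t *)memblk)[-1] */
    return (void *)aligned;
  }

  static void my_free(void *memblk) {
    if (memblk)
      free((void *)((size_t *)memblk)[-1]);  /* recover the original malloc() address */
  }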
diff --git a/libvpx/vpx_mem/vpx_mem.h b/libvpx/vpx_mem/vpx_mem.h
index 33686b2..a006e0f 100644
--- a/libvpx/vpx_mem/vpx_mem.h
+++ b/libvpx/vpx_mem/vpx_mem.h
@@ -17,27 +17,6 @@
 # include <lddk.h>
 #endif
 
-/* vpx_mem version info */
-#define vpx_mem_version "2.2.1.5"
-
-#define VPX_MEM_VERSION_CHIEF 2
-#define VPX_MEM_VERSION_MAJOR 2
-#define VPX_MEM_VERSION_MINOR 1
-#define VPX_MEM_VERSION_PATCH 5
-/* end - vpx_mem version info */
-
-#ifndef VPX_TRACK_MEM_USAGE
-# define VPX_TRACK_MEM_USAGE       0  /* enable memory tracking/integrity checks */
-#endif
-#ifndef VPX_CHECK_MEM_FUNCTIONS
-# define VPX_CHECK_MEM_FUNCTIONS   0  /* enable basic safety checks in _memcpy,
-_memset, and _memmove */
-#endif
-#ifndef REPLACE_BUILTIN_FUNCTIONS
-# define REPLACE_BUILTIN_FUNCTIONS 0  /* replace builtin functions with their
-vpx_ equivalents */
-#endif
-
 #include <stdlib.h>
 #include <stddef.h>
 
@@ -45,122 +24,17 @@
 extern "C" {
 #endif
 
-  /*
-      vpx_mem_get_version()
-      provided for runtime version checking. Returns an unsigned int of the form
-      CHIEF | MAJOR | MINOR | PATCH, where the chief version number is the high
-      order byte.
-  */
-  unsigned int vpx_mem_get_version(void);
-
-  /*
-      vpx_mem_set_heap_size(size_t size)
-        size - size in bytes for the memory manager to allocate for its heap
-      Sets the memory manager's initial heap size
-      Return:
-        0: on success
-        -1: if memory manager calls have not been included in the vpx_mem lib
-        -2: if the memory manager has been compiled to use static memory
-        -3: if the memory manager has already allocated its heap
-  */
-  int vpx_mem_set_heap_size(size_t size);
-
   void *vpx_memalign(size_t align, size_t size);
   void *vpx_malloc(size_t size);
   void *vpx_calloc(size_t num, size_t size);
   void *vpx_realloc(void *memblk, size_t size);
   void vpx_free(void *memblk);
 
-  void *vpx_memcpy(void *dest, const void *src, size_t length);
-  void *vpx_memset(void *dest, int val, size_t length);
-  void *vpx_memmove(void *dest, const void *src, size_t count);
-
-  /* special memory functions */
-  void *vpx_mem_alloc(int id, size_t size, size_t align);
-  void vpx_mem_free(int id, void *mem, size_t size);
-
-  /* Wrappers to standard library functions. */
-  typedef void *(* g_malloc_func)(size_t);
-  typedef void *(* g_calloc_func)(size_t, size_t);
-  typedef void *(* g_realloc_func)(void *, size_t);
-  typedef void (* g_free_func)(void *);
-  typedef void *(* g_memcpy_func)(void *, const void *, size_t);
-  typedef void *(* g_memset_func)(void *, int, size_t);
-  typedef void *(* g_memmove_func)(void *, const void *, size_t);
-
-  int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
-  int vpx_mem_unset_functions(void);
-
-
-  /* some defines for backward compatibility */
-#define DMEM_GENERAL 0
-
-// (*)<
-
-#if REPLACE_BUILTIN_FUNCTIONS
-# ifndef __VPX_MEM_C__
-#  define memalign vpx_memalign
-#  define malloc   vpx_malloc
-#  define calloc   vpx_calloc
-#  define realloc  vpx_realloc
-#  define free     vpx_free
-#  define memcpy   vpx_memcpy
-#  define memmove  vpx_memmove
-#  define memset   vpx_memset
-# endif
+#if CONFIG_VP9_HIGHBITDEPTH
+  void *vpx_memset16(void *dest, int val, size_t length);
 #endif
 
-#if CONFIG_MEM_TRACKER
-#include <stdarg.h>
-  /*from vpx_mem/vpx_mem_tracker.c*/
-  extern void vpx_memory_tracker_dump();
-  extern void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
-  extern int vpx_memory_tracker_set_log_type(int type, char *option);
-  extern int vpx_memory_tracker_set_log_func(void *userdata,
-                                             void(*logfunc)(void *userdata,
-                                                            const char *fmt, va_list args));
-# ifndef __VPX_MEM_C__
-#  define vpx_memalign(align, size) xvpx_memalign((align), (size), __FILE__, __LINE__)
-#  define vpx_malloc(size)          xvpx_malloc((size), __FILE__, __LINE__)
-#  define vpx_calloc(num, size)     xvpx_calloc(num, size, __FILE__, __LINE__)
-#  define vpx_realloc(addr, size)   xvpx_realloc(addr, size, __FILE__, __LINE__)
-#  define vpx_free(addr)            xvpx_free(addr, __FILE__, __LINE__)
-#  define vpx_memory_tracker_check_integrity() vpx_memory_tracker_check_integrity(__FILE__, __LINE__)
-#  define vpx_mem_alloc(id,size,align) xvpx_mem_alloc(id, size, align, __FILE__, __LINE__)
-#  define vpx_mem_free(id,mem,size) xvpx_mem_free(id, mem, size, __FILE__, __LINE__)
-# endif
-
-  void *xvpx_memalign(size_t align, size_t size, char *file, int line);
-  void *xvpx_malloc(size_t size, char *file, int line);
-  void *xvpx_calloc(size_t num, size_t size, char *file, int line);
-  void *xvpx_realloc(void *memblk, size_t size, char *file, int line);
-  void xvpx_free(void *memblk, char *file, int line);
-  void *xvpx_mem_alloc(int id, size_t size, size_t align, char *file, int line);
-  void xvpx_mem_free(int id, void *mem, size_t size, char *file, int line);
-
-#else
-# ifndef __VPX_MEM_C__
-#  define vpx_memory_tracker_dump()
-#  define vpx_memory_tracker_check_integrity()
-#  define vpx_memory_tracker_set_log_type(t,o) 0
-#  define vpx_memory_tracker_set_log_func(u,f) 0
-# endif
-#endif
-
-#if !VPX_CHECK_MEM_FUNCTIONS
-# ifndef __VPX_MEM_C__
-#  include <string.h>
-#  define vpx_memcpy  memcpy
-#  define vpx_memset  memset
-#  define vpx_memmove memmove
-# endif
-#endif
+#include <string.h>
 
 #ifdef VPX_MEM_PLTFRM
 # include VPX_MEM_PLTFRM
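
With the wrappers removed, callers of vpx_memcpy/vpx_memset/vpx_memmove now use the libc functions directly; the only new entry point is vpx_memset16(), whose length argument counts uint16_t elements rather than bytes. A hedged usage sketch for a CONFIG_VP9_HIGHBITDEPTH build; the function name and plane size are illustrative:

  #include "./vpx_config.h"
  #include "vpx/vpx_integer.h"
  #include "vpx_mem/vpx_mem.h"

  /* Illustrative only: fill a 10-bit plane with mid-grey (512).  Note that the
   * third argument of vpx_memset16() counts uint16_t elements, not bytes. */
  static void clear_plane_example(void) {
  #if CONFIG_VP9_HIGHBITDEPTH
    const int w = 64, h = 64;
    uint16_t *plane = (uint16_t *)vpx_malloc(w * h * sizeof(*plane));
    if (plane) {
      vpx_memset16(plane, 512, w * h);
      vpx_free(plane);
    }
  #endif
  }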
diff --git a/libvpx/vpx_mem/vpx_mem.mk b/libvpx/vpx_mem/vpx_mem.mk
index 4663c5a..7f275ea 100644
--- a/libvpx/vpx_mem/vpx_mem.mk
+++ b/libvpx/vpx_mem/vpx_mem.mk
@@ -2,21 +2,3 @@
 MEM_SRCS-yes += vpx_mem.c
 MEM_SRCS-yes += vpx_mem.h
 MEM_SRCS-yes += include/vpx_mem_intrnl.h
-
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += vpx_mem_tracker.c
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += include/vpx_mem_tracker.h
-
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_true.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_resize.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_shrink.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_largest.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_dflt_abort.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_base.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_intrnl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_if.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_cnfg.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/heapmm.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_impl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_grow.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_alloc.c
diff --git a/libvpx/vpx_mem/vpx_mem_tracker.c b/libvpx/vpx_mem/vpx_mem_tracker.c
deleted file mode 100644
index 613e8a1..0000000
--- a/libvpx/vpx_mem/vpx_mem_tracker.c
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
-  vpx_mem_tracker.c
-
-  jwz 2003-09-30:
-   Stores a list of addreses, their size, and file and line they came from.
-   All exposed lib functions are prefaced by vpx_ and allow the global list
-   to be thread safe.
-   Current supported platforms are:
-    Linux, Win32, win_ce and vx_works
-   Further support can be added by defining the platform specific mutex
-   in the memory_tracker struct as well as calls to create/destroy/lock/unlock
-   the mutex in vpx_memory_tracker_init/Destroy and memory_tracker_lock_mutex/unlock_mutex
-*/
-#include "./vpx_config.h"
-
-#if defined(__uClinux__)
-# include <lddk.h>
-#endif
-
-#if HAVE_PTHREAD_H
-# include <pthread.h>
-#elif defined(WIN32) || defined(_WIN32_WCE)
-# define WIN32_LEAN_AND_MEAN
-# include <windows.h>
-# include <winbase.h>
-#elif defined(VXWORKS)
-# include <sem_lib.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> // VXWORKS doesn't have a malloc/memory.h file,
-// this should pull in malloc,free,etc.
-#include <stdarg.h>
-
-#include "include/vpx_mem_tracker.h"
-
-#undef vpx_malloc   // undefine any vpx_mem macros that may affect calls to
-#undef vpx_free     // memory functions in this file
-#undef vpx_memcpy
-#undef vpx_memset
-
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS   0  // use function pointers instead of compiled functions.
-#endif
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-static mem_track_malloc_func g_malloc   = malloc;
-static mem_track_calloc_func g_calloc   = calloc;
-static mem_track_realloc_func g_realloc = realloc;
-static mem_track_free_func g_free       = free;
-static mem_track_memcpy_func g_memcpy   = memcpy;
-static mem_track_memset_func g_memset   = memset;
-static mem_track_memmove_func g_memmove = memmove;
-# define MEM_TRACK_MALLOC g_malloc
-# define MEM_TRACK_FREE   g_free
-# define MEM_TRACK_MEMCPY g_memcpy
-# define MEM_TRACK_MEMSET g_memset
-#else
-# define MEM_TRACK_MALLOC vpx_malloc
-# define MEM_TRACK_FREE   vpx_free
-# define MEM_TRACK_MEMCPY vpx_memcpy
-# define MEM_TRACK_MEMSET vpx_memset
-#endif // USE_GLOBAL_FUNCTION_POINTERS
-
-/* prototypes for internal library functions */
-static void memtrack_log(const char *fmt, ...);
-static void memory_tracker_dump();
-static void memory_tracker_check_integrity(char *file, unsigned int line);
-static void memory_tracker_add(size_t addr, unsigned int size,
-                               char *file, unsigned int line,
-                               int padded);
-static int memory_tracker_remove(size_t addr);
-static struct mem_block *memory_tracker_find(size_t addr);
-
-#if defined(NO_MUTEX)
-# define memory_tracker_lock_mutex() (!g_b_mem_tracker_inited)
-# define memory_tracker_unlock_mutex()
-#else
-static int memory_tracker_lock_mutex();
-static int memory_tracker_unlock_mutex();
-#endif
-
-#ifndef VPX_NO_GLOBALS
-struct memory_tracker {
-  struct mem_block *head,
-      * tail;
-  int len,
-      totalsize;
-  unsigned int current_allocated,
-           max_allocated;
-
-#if HAVE_PTHREAD_H
-  pthread_mutex_t mutex;
-#elif defined(WIN32) || defined(_WIN32_WCE)
-  HANDLE mutex;
-#elif defined(VXWORKS)
-  SEM_ID mutex;
-#elif defined(NO_MUTEX)
-#else
-#error "No mutex type defined for this platform!"
-#endif
-
-  int padding_size,
-      pad_value;
-};
-
-static struct memory_tracker memtrack;   // our global memory allocation list
-static int g_b_mem_tracker_inited = 0;     // indicates whether the global list has
-// been initialized (1:yes/0:no)
-static struct {
-  FILE *file;
-  int type;
-  void (*func)(void *userdata, const char *fmt, va_list args);
-  void *userdata;
-} g_logging = {NULL, 0, NULL, NULL};
-#else
-# include "vpx_global_handling.h"
-#define g_b_mem_tracker_inited vpxglobalm(vpxmem,g_b_mem_tracker_inited)
-#define g_logging vpxglobalm(vpxmem,g_logging)
-#define memtrack vpxglobalm(vpxmem,memtrack)
-#endif // #ifndef VPX_NO_GLOBALS
-
-extern void *vpx_malloc(size_t size);
-extern void vpx_free(void *memblk);
-extern void *vpx_memcpy(void *dest, const void *src, size_t length);
-extern void *vpx_memset(void *dest, int val, size_t length);
-
-/*
- *
- * Exposed library functions
- *
-*/
-
-/*
-    vpx_memory_tracker_init(int padding_size, int pad_value)
-      padding_size - the size of the padding before and after each mem addr.
-                     Values > 0 indicate that integrity checks can be performed
-                     by inspecting these areas.
-      pad_value - the initial value within the padding area before and after
-                  each mem addr.
-
-    Initializes global memory tracker structure
-    Allocates the head of the list
-*/
-int vpx_memory_tracker_init(int padding_size, int pad_value) {
-  if (!g_b_mem_tracker_inited) {
-    if ((memtrack.head = (struct mem_block *)
-                         MEM_TRACK_MALLOC(sizeof(struct mem_block)))) {
-      int ret;
-
-      MEM_TRACK_MEMSET(memtrack.head, 0, sizeof(struct mem_block));
-
-      memtrack.tail = memtrack.head;
-
-      memtrack.current_allocated = 0;
-      memtrack.max_allocated     = 0;
-
-      memtrack.padding_size = padding_size;
-      memtrack.pad_value    = pad_value;
-
-#if HAVE_PTHREAD_H
-      ret = pthread_mutex_init(&memtrack.mutex,
-                               NULL);            /*mutex attributes (NULL=default)*/
-#elif defined(WIN32) || defined(_WIN32_WCE)
-      memtrack.mutex = CreateMutex(NULL,   /*security attributes*/
-                                   FALSE,  /*we don't want initial ownership*/
-                                   NULL);  /*mutex name*/
-      ret = !memtrack.mutex;
-#elif defined(VXWORKS)
-      memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
-                                   SEM_FULL);  /*SEM_FULL initial state is unlocked*/
-      ret = !memtrack.mutex;
-#elif defined(NO_MUTEX)
-      ret = 0;
-#endif
-
-      if (ret) {
-        memtrack_log("vpx_memory_tracker_init: Error creating mutex!\n");
-
-        MEM_TRACK_FREE(memtrack.head);
-        memtrack.head = NULL;
-      } else {
-        memtrack_log("Memory Tracker init'd, v."vpx_mem_tracker_version" pad_size:%d pad_val:0x%x %d\n"
-, padding_size
-, pad_value
-, pad_value);
-        g_b_mem_tracker_inited = 1;
-      }
-    }
-  }
-
-  return g_b_mem_tracker_inited;
-}
-
-/*
-    vpx_memory_tracker_destroy()
-    If our global struct was initialized zeros out all its members,
-    frees memory and destroys it's mutex
-*/
-void vpx_memory_tracker_destroy() {
-  if (!memory_tracker_lock_mutex()) {
-    struct mem_block *p  = memtrack.head,
-                          * p2 = memtrack.head;
-
-    memory_tracker_dump();
-
-    while (p) {
-      p2 = p;
-      p  = p->next;
-
-      MEM_TRACK_FREE(p2);
-    }
-
-    memtrack.head              = NULL;
-    memtrack.tail              = NULL;
-    memtrack.len               = 0;
-    memtrack.current_allocated = 0;
-    memtrack.max_allocated     = 0;
-
-    if (!g_logging.type && g_logging.file && g_logging.file != stderr) {
-      fclose(g_logging.file);
-      g_logging.file = NULL;
-    }
-
-    memory_tracker_unlock_mutex();
-
-    g_b_mem_tracker_inited = 0;
-  }
-}
-
-/*
-    vpx_memory_tracker_add(size_t addr, unsigned int size,
-                         char * file, unsigned int line)
-      addr - memory address to be added to list
-      size - size of addr
-      file - the file addr was referenced from
-      line - the line in file addr was referenced from
-    Adds memory address addr, it's size, file and line it came from
-    to the global list via the thread safe internal library function
-*/
-void vpx_memory_tracker_add(size_t addr, unsigned int size,
-                            char *file, unsigned int line,
-                            int padded) {
-  memory_tracker_add(addr, size, file, line, padded);
-}
-
-/*
-    vpx_memory_tracker_remove(size_t addr)
-      addr - memory address to be removed from list
-    Removes addr from the global list via the thread safe
-    internal remove function
-    Return:
-      Same as described for memory_tracker_remove
-*/
-int vpx_memory_tracker_remove(size_t addr) {
-  return memory_tracker_remove(addr);
-}
-
-/*
-    vpx_memory_tracker_find(size_t addr)
-      addr - address to be found in list
-    Return:
-        If found, pointer to the memory block that matches addr
-        NULL otherwise
-*/
-struct mem_block *vpx_memory_tracker_find(size_t addr) {
-  struct mem_block *p = NULL;
-
-  if (!memory_tracker_lock_mutex()) {
-    p = memory_tracker_find(addr);
-    memory_tracker_unlock_mutex();
-  }
-
-  return p;
-}
-
-/*
-    vpx_memory_tracker_dump()
-    Locks the memory tracker's mutex and calls the internal
-    library function to dump the current contents of the
-    global memory allocation list
-*/
-void vpx_memory_tracker_dump() {
-  if (!memory_tracker_lock_mutex()) {
-    memory_tracker_dump();
-    memory_tracker_unlock_mutex();
-  }
-}
-
-/*
-    vpx_memory_tracker_check_integrity(char* file, unsigned int line)
-      file - The file name where the check was placed
-      line - The line in file where the check was placed
-    Locks the memory tracker's mutex and calls the internal
-    integrity check function to inspect every address in the global
-    memory allocation list
-*/
-void vpx_memory_tracker_check_integrity(char *file, unsigned int line) {
-  if (!memory_tracker_lock_mutex()) {
-    memory_tracker_check_integrity(file, line);
-    memory_tracker_unlock_mutex();
-  }
-}
-
-/*
-    vpx_memory_tracker_set_log_type
-    Sets the logging type for the memory tracker. Based on the value it will
-    direct its output to the appropriate place.
-    Return:
-      0: on success
-      -1: if the logging type could not be set, because the value was invalid
-          or because a file could not be opened
-*/
-int vpx_memory_tracker_set_log_type(int type, char *option) {
-  int ret = -1;
-
-  switch (type) {
-    case 0:
-      g_logging.type = 0;
-
-      if (!option) {
-        g_logging.file = stderr;
-        ret = 0;
-      } else {
-        if ((g_logging.file = fopen((char *)option, "w")))
-          ret = 0;
-      }
-
-      break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
-    case 1:
-      g_logging.type = type;
-      ret = 0;
-      break;
-#endif
-    default:
-      break;
-  }
-
-  // output the version to the new logging destination
-  if (!ret)
-    memtrack_log("Memory Tracker logging initialized, "
-                 "Memory Tracker v."vpx_mem_tracker_version"\n");
-
-  return ret;
-}
-
-/*
-    vpx_memory_tracker_set_log_func
-    Sets a logging function to be used by the memory tracker.
-    Return:
-      0: on success
-      -1: if the logging type could not be set because logfunc was NULL
-*/
-int vpx_memory_tracker_set_log_func(void *userdata,
-                                    void(*logfunc)(void *userdata,
-                                                   const char *fmt, va_list args)) {
-  int ret = -1;
-
-  if (logfunc) {
-    g_logging.type     = -1;
-    g_logging.userdata = userdata;
-    g_logging.func     = logfunc;
-    ret = 0;
-  }
-
-  // output the version to the new logging destination
-  if (!ret)
-    memtrack_log("Memory Tracker logging initialized, "
-                 "Memory Tracker v."vpx_mem_tracker_version"\n");
-
-  return ret;
-}
-
-/*
- *
- * END - Exposed library functions
- *
-*/
-
-
-/*
- *
- * Internal library functions
- *
-*/
-
-static void memtrack_log(const char *fmt, ...) {
-  va_list list;
-
-  va_start(list, fmt);
-
-  switch (g_logging.type) {
-    case -1:
-
-      if (g_logging.func)
-        g_logging.func(g_logging.userdata, fmt, list);
-
-      break;
-    case 0:
-
-      if (g_logging.file) {
-        vfprintf(g_logging.file, fmt, list);
-        fflush(g_logging.file);
-      }
-
-      break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
-    case 1: {
-      char temp[1024];
-      _vsnprintf(temp, sizeof(temp) / sizeof(char) - 1, fmt, list);
-      OutputDebugString(temp);
-    }
-    break;
-#endif
-    default:
-      break;
-  }
-
-  va_end(list);
-}
-
-/*
-    memory_tracker_dump()
-    Dumps the current contents of the global memory allocation list
-*/
-static void memory_tracker_dump() {
-  int i = 0;
-  struct mem_block *p = (memtrack.head ? memtrack.head->next : NULL);
-
-  memtrack_log("\n_currently Allocated= %d; Max allocated= %d\n",
-               memtrack.current_allocated, memtrack.max_allocated);
-
-  while (p) {
-#if defined(WIN32) && !defined(_WIN32_WCE)
-
-    /*when using outputdebugstring, output filenames so they
-      can be clicked to be opened in visual studio*/
-    if (g_logging.type == 1)
-      memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file:\n"
-                   "  %s(%d):\n", i,
-                   p->addr, i, p->size,
-                   p->file, p->line);
-    else
-#endif
-      memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file: %s, line: %d\n", i,
-                   p->addr, i, p->size,
-                   p->file, p->line);
-
-    p = p->next;
-    ++i;
-  }
-
-  memtrack_log("\n");
-}
-
-/*
-    memory_tracker_check_integrity(char* file, unsigned int file)
-      file - the file name where the check was placed
-      line - the line in file where the check was placed
-    If a padding_size was supplied to vpx_memory_tracker_init()
-    this function will check ea. addr in the list verifying that
-    addr-padding_size and addr+padding_size is filled with pad_value
-*/
-static void memory_tracker_check_integrity(char *file, unsigned int line) {
-  if (memtrack.padding_size) {
-    int i,
-        index = 0;
-    unsigned char *p_show_me,
-             * p_show_me2;
-    unsigned int tempme = memtrack.pad_value,
-                 dead1,
-                 dead2;
-    unsigned char *x_bounds;
-    struct mem_block *p = memtrack.head->next;
-
-    while (p) {
-      // x_bounds = (unsigned char*)p->addr;
-      // back up VPX_BYTE_ALIGNMENT
-      // x_bounds -= memtrack.padding_size;
-
-      if (p->padded) { // can the bounds be checked?
-        /*yes, move to the address that was actually allocated
-        by the vpx_* calls*/
-        x_bounds = (unsigned char *)(((size_t *)p->addr)[-1]);
-
-        for (i = 0; i < memtrack.padding_size; i += sizeof(unsigned int)) {
-          p_show_me = (x_bounds + i);
-          p_show_me2 = (unsigned char *)(p->addr + p->size + i);
-
-          MEM_TRACK_MEMCPY(&dead1, p_show_me, sizeof(unsigned int));
-          MEM_TRACK_MEMCPY(&dead2, p_show_me2, sizeof(unsigned int));
-
-          if ((dead1 != tempme) || (dead2 != tempme)) {
-            memtrack_log("\n[vpx_mem integrity check failed]:\n"
-                         "    index[%d,%d] {%s:%d} addr=0x%x, size=%d,"
-                         " file: %s, line: %d c0:0x%x c1:0x%x\n",
-                         index, i, file, line, p->addr, p->size, p->file,
-                         p->line, dead1, dead2);
-          }
-        }
-      }
-
-      ++index;
-      p = p->next;
-    }
-  }
-}
-
-/*
-    memory_tracker_add(size_t addr, unsigned int size,
-                     char * file, unsigned int line)
-    Adds an address (addr), it's size, file and line number to our list.
-    Adjusts the total bytes allocated and max bytes allocated if necessary.
-    If memory cannot be allocated the list will be destroyed.
-*/
-void memory_tracker_add(size_t addr, unsigned int size,
-                        char *file, unsigned int line,
-                        int padded) {
-  if (!memory_tracker_lock_mutex()) {
-    struct mem_block *p;
-
-    p = MEM_TRACK_MALLOC(sizeof(struct mem_block));
-
-    if (p) {
-      p->prev       = memtrack.tail;
-      p->prev->next = p;
-      p->addr       = addr;
-      p->size       = size;
-      p->line       = line;
-      p->file       = file;
-      p->padded     = padded;
-      p->next       = NULL;
-
-      memtrack.tail = p;
-
-      memtrack.current_allocated += size;
-
-      if (memtrack.current_allocated > memtrack.max_allocated)
-        memtrack.max_allocated = memtrack.current_allocated;
-
-      // memtrack_log("memory_tracker_add: added addr=0x%.8x\n", addr);
-
-      memory_tracker_unlock_mutex();
-    } else {
-      memtrack_log("memory_tracker_add: error allocating memory!\n");
-      memory_tracker_unlock_mutex();
-      vpx_memory_tracker_destroy();
-    }
-  }
-}
-
-/*
-    memory_tracker_remove(size_t addr)
-    Removes an address and its corresponding size (if they exist)
-    from the memory tracker list and adjusts the current number
-    of bytes allocated.
-    Return:
-      0: on success
-      -1: if the mutex could not be locked
-      -2: if the addr was not found in the list
-*/
-int memory_tracker_remove(size_t addr) {
-  int ret = -1;
-
-  if (!memory_tracker_lock_mutex()) {
-    struct mem_block *p;
-
-    if ((p = memory_tracker_find(addr))) {
-      memtrack.current_allocated -= p->size;
-
-      p->prev->next = p->next;
-
-      if (p->next)
-        p->next->prev = p->prev;
-      else
-        memtrack.tail = p->prev;
-
-      ret = 0;
-      MEM_TRACK_FREE(p);
-    } else {
-      if (addr)
-        memtrack_log("memory_tracker_remove(): addr not found in list,"
-                     " 0x%.8x\n", addr);
-
-      ret = -2;
-    }
-
-    memory_tracker_unlock_mutex();
-  }
-
-  return ret;
-}
-
-/*
-    memory_tracker_find(size_t addr)
-    Finds an address in our addrs list
-    NOTE: the mutex MUST be locked in the other internal
-          functions before calling this one. This avoids
-          the need for repeated locking and unlocking as in Remove
-    Returns: pointer to the mem block if found, NULL otherwise
-*/
-static struct mem_block *memory_tracker_find(size_t addr) {
-  struct mem_block *p = NULL;
-
-  if (memtrack.head) {
-    p = memtrack.head->next;
-
-    while (p && (p->addr != addr))
-      p = p->next;
-  }
-
-  return p;
-}
-
-
-#if !defined(NO_MUTEX)
-/*
-    memory_tracker_lock_mutex()
-    Locks the memory tracker mutex with a platform specific call
-    Returns:
-        0: Success
-       <0: Failure, either the mutex was not initialized
-           or the call to lock the mutex failed
-*/
-static int memory_tracker_lock_mutex() {
-  int ret = -1;
-
-  if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
-    ret = pthread_mutex_lock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
-    ret = WaitForSingleObject(memtrack.mutex, INFINITE);
-#elif defined(VXWORKS)
-    ret = sem_take(memtrack.mutex, WAIT_FOREVER);
-#endif
-
-    if (ret) {
-      memtrack_log("memory_tracker_lock_mutex: mutex lock failed\n");
-    }
-  }
-
-  return ret;
-}
-
-/*
-    memory_tracker_unlock_mutex()
-    Unlocks the memory tracker mutex with a platform specific call
-    Returns:
-        0: Success
-       <0: Failure, either the mutex was not initialized
-           or the call to unlock the mutex failed
-*/
-static int memory_tracker_unlock_mutex() {
-  int ret = -1;
-
-  if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
-    ret = pthread_mutex_unlock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
-    ret = !ReleaseMutex(memtrack.mutex);
-#elif defined(VXWORKS)
-    ret = sem_give(memtrack.mutex);
-#endif
-
-    if (ret) {
-      memtrack_log("memory_tracker_unlock_mutex: mutex unlock failed\n");
-    }
-  }
-
-  return ret;
-}
-#endif
-
-/*
-    vpx_memory_tracker_set_functions
-
-    Sets the function pointers for the standard library functions.
-
-    Return:
-      0: on success
-      -1: if the use global function pointers is not set.
-*/
-int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-  if (g_malloc_l)
-    g_malloc = g_malloc_l;
-
-  if (g_calloc_l)
-    g_calloc = g_calloc_l;
-
-  if (g_realloc_l)
-    g_realloc = g_realloc_l;
-
-  if (g_free_l)
-    g_free = g_free_l;
-
-  if (g_memcpy_l)
-    g_memcpy = g_memcpy_l;
-
-  if (g_memset_l)
-    g_memset = g_memset_l;
-
-  if (g_memmove_l)
-    g_memmove = g_memmove_l;
-
-  return 0;
-#else
-  (void)g_malloc_l;
-  (void)g_calloc_l;
-  (void)g_realloc_l;
-  (void)g_free_l;
-  (void)g_memcpy_l;
-  (void)g_memset_l;
-  (void)g_memmove_l;
-  return -1;
-#endif
-}
diff --git a/libvpx/vpx_ports/arm.h b/libvpx/vpx_ports/arm.h
index 1e4a8e2..42c98f5 100644
--- a/libvpx/vpx_ports/arm.h
+++ b/libvpx/vpx_ports/arm.h
@@ -27,6 +27,12 @@
 
 int arm_cpu_caps(void);
 
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
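
VPX_INCOMPATIBLE_GCC gives NEON intrinsic files a single switch for working around miscompilations in non-clang gcc 4.6 and earlier. A hedged sketch of how a source file might fall back to plain C when the macro is defined; the function and its fallback are illustrative, not libvpx code:

  #include <stdint.h>
  #include "vpx_ports/arm.h"  /* defines VPX_INCOMPATIBLE_GCC for affected gcc */

  #if defined(__ARM_NEON__) && !defined(VPX_INCOMPATIBLE_GCC)
  #include <arm_neon.h>
  static void add_16_bytes(uint8_t *dst, const uint8_t *a, const uint8_t *b) {
    vst1q_u8(dst, vaddq_u8(vld1q_u8(a), vld1q_u8(b)));  /* 16 lanes in one op */
  }
  #else
  static void add_16_bytes(uint8_t *dst, const uint8_t *a, const uint8_t *b) {
    int i;
    for (i = 0; i < 16; i++) dst[i] = (uint8_t)(a[i] + b[i]);  /* plain C fallback */
  }
  #endif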
diff --git a/libvpx/vpx_ports/arm_cpudetect.c b/libvpx/vpx_ports/arm_cpudetect.c
index fa0e030..8a4b8af 100644
--- a/libvpx/vpx_ports/arm_cpudetect.c
+++ b/libvpx/vpx_ports/arm_cpudetect.c
@@ -10,7 +10,8 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include "arm.h"
+#include "vpx_ports/arm.h"
+#include "./vpx_config.h"
 
 #ifdef WINAPI_FAMILY
 #include <winapifamily.h>
@@ -48,15 +49,12 @@
     return flags;
   }
   mask = arm_cpu_env_mask();
-#if HAVE_EDSP
-  flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
 #if HAVE_MEDIA
   flags |= HAS_MEDIA;
 #endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
   flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON  || HAVE_NEON_ASM */
   return flags & mask;
 }
 
@@ -77,16 +75,6 @@
    *  instructions via their assembled hex code.
    * All of these instructions should be essentially nops.
    */
-#if HAVE_EDSP
-  if (mask & HAS_EDSP) {
-    __try {
-      /*PLD [r13]*/
-      __emit(0xF5DDF000);
-      flags |= HAS_EDSP;
-    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
-      /*Ignore exception.*/
-    }
-  }
 #if HAVE_MEDIA
   if (mask & HAS_MEDIA)
     __try {
@@ -97,7 +85,8 @@
     /*Ignore exception.*/
   }
 }
-#if HAVE_NEON
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
 if (mask &HAS_NEON) {
   __try {
     /*VORR q0,q0,q0*/
@@ -107,9 +96,7 @@
     /*Ignore exception.*/
   }
 }
-#endif /* HAVE_NEON */
-#endif /* HAVE_MEDIA */
-#endif /* HAVE_EDSP */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
 return flags & mask;
 }
 
@@ -126,16 +113,13 @@
   mask = arm_cpu_env_mask();
   features = android_getCpuFeatures();
 
-#if HAVE_EDSP
-  flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
 #if HAVE_MEDIA
   flags |= HAS_MEDIA;
 #endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
   if (features & ANDROID_CPU_ARM_FEATURE_NEON)
     flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
   return flags & mask;
 }
 
@@ -162,23 +146,15 @@
      */
     char buf[512];
     while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_EDSP || HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
       if (memcmp(buf, "Features", 8) == 0) {
         char *p;
-#if HAVE_EDSP
-        p = strstr(buf, " edsp");
-        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= HAS_EDSP;
-        }
-#if HAVE_NEON
         p = strstr(buf, " neon");
         if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
           flags |= HAS_NEON;
         }
-#endif /* HAVE_NEON */
-#endif /* HAVE_EDSP */
       }
-#endif /* HAVE_EDSP || HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
 #if HAVE_MEDIA
       if (memcmp(buf, "CPU architecture:", 17) == 0) {
         int version;
diff --git a/libvpx/vpx_ports/asm_offsets.h b/libvpx/vpx_ports/asm_offsets.h
deleted file mode 100644
index 317bbed..0000000
--- a/libvpx/vpx_ports/asm_offsets.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_PORTS_ASM_OFFSETS_H_
-#define VPX_PORTS_ASM_OFFSETS_H_
-
-#include <stddef.h>
-
-#define ct_assert(name,cond) \
-  static void assert_##name(void) UNUSED;\
-  static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
-
-#if INLINE_ASM
-#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val))
-#define BEGIN int main(void) {
-#define END return 0; }
-#else
-#define DEFINE(sym, val) const int sym = val
-#define BEGIN
-#define END
-#endif
-
-#endif  // VPX_PORTS_ASM_OFFSETS_H_
diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vpx_ports/bitops.h
similarity index 68%
rename from libvpx/vp9/common/vp9_systemdependent.h
rename to libvpx/vpx_ports/bitops.h
index e971158..0d3223e 100644
--- a/libvpx/vp9/common/vp9_systemdependent.h
+++ b/libvpx/vpx_ports/bitops.h
@@ -8,47 +8,30 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
-#define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
+#ifndef VPX_PORTS_BITOPS_H_
+#define VPX_PORTS_BITOPS_H_
+
+#include "vpx_ports/msvc.h"
 
 #ifdef _MSC_VER
 # include <math.h>  // the ceil() definition must precede intrin.h
 # if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
 #  include <intrin.h>
-#  define USE_MSC_INTRIN
+#  define USE_MSC_INTRINSICS
 # endif
-# define snprintf _snprintf
 #endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "./vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
-void vpx_reset_mmx_state(void);
-#define vp9_clear_system_state() vpx_reset_mmx_state()
-#else
-#define vp9_clear_system_state()
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER < 1800
-// round is not defined in MSVC before VS2013.
-static INLINE int round(double x) {
-  if (x < 0)
-    return (int)ceil(x - 0.5);
-  else
-    return (int)floor(x + 0.5);
-}
-#endif
-
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
 static INLINE int get_msb(unsigned int n) {
   return 31 ^ __builtin_clz(n);
 }
-#elif defined(USE_MSC_INTRIN)
+#elif defined(USE_MSC_INTRINSICS)
 #pragma intrinsic(_BitScanReverse)
 
 static INLINE int get_msb(unsigned int n) {
@@ -56,7 +39,7 @@
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#undef USE_MSC_INTRIN
+#undef USE_MSC_INTRINSICS
 #else
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
@@ -80,4 +63,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
+#endif  // VPX_PORTS_BITOPS_H_
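
All three get_msb() variants return the index of the highest set bit, i.e. (int)floor(log2(n)) for n > 0. A portable sketch of the same computation, plus why the GNU branch can use 31 ^ __builtin_clz(n):

  #include <assert.h>

  /* Portable sketch of get_msb(): index of the highest set bit of n, n > 0. */
  static int get_msb_portable(unsigned int n) {
    int msb = 0;
    assert(n != 0);
    while (n >>= 1) msb++;  /* count how many times n halves before reaching 0 */
    return msb;
  }
  /* The GNU branch computes the same value as 31 ^ __builtin_clz(n): the
   * highest set bit of a 32-bit value sits at index 31 - clz(n), and
   * 31 ^ x equals 31 - x for any x in [0, 31]. */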
diff --git a/libvpx/vpx_ports/mem.h b/libvpx/vpx_ports/mem.h
index e91d776..7502f90 100644
--- a/libvpx/vpx_ports/mem.h
+++ b/libvpx/vpx_ports/mem.h
@@ -23,18 +23,6 @@
 #warning No alignment directives known for this compiler.
 #define DECLARE_ALIGNED(n,typ,val)  typ val
 #endif
-#endif
-
-
-/* Declare an aligned array on the stack, for situations where the stack
- * pointer may not have the alignment we expect. Creates an array with a
- * modified name, then defines val to be a pointer, and aligns that pointer
- * within the array.
- */
-#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
-  typ val##_[(n)+(a)/sizeof(typ)+1];\
-  typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))
-
 
 /* Indicates that the usage of the specified variable has been audited to assure
  * that it's safe to use uninitialized. Silences 'may be used uninitialized'
@@ -44,4 +32,22 @@
 #define UNINITIALIZED_IS_SAFE(x) x=x
 #else
 #define UNINITIALIZED_IS_SAFE(x) x
+#endif
+
+#if HAVE_NEON && defined(_MSC_VER)
+#define __builtin_prefetch(x)
+#endif
+
+/* Shift down with rounding */
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // VPX_PORTS_MEM_H_
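
ROUND_POWER_OF_TWO() is a right shift with round-to-nearest, ALIGN_POWER_OF_TWO() rounds a value up to the next multiple of 2^n, and CONVERT_TO_SHORTPTR()/CONVERT_TO_BYTEPTR() shift a pointer's address so high-bit-depth planes can travel through uint8_t* interfaces. Two worked examples for the arithmetic macros; the values are illustrative:

  #include <assert.h>
  #include "vpx_ports/mem.h"

  static void mem_macro_examples(void) {
    /* Shift down with rounding: (73 + 4) >> 3 == 9, i.e. 73/8 rounded to nearest. */
    assert(ROUND_POWER_OF_TWO(73, 3) == 9);
    /* Round 100 up to the next multiple of 2^4 == 16. */
    assert(ALIGN_POWER_OF_TWO(100, 4) == 112);
  }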
diff --git a/libvpx/vpx_ports/msvc.h b/libvpx/vpx_ports/msvc.h
new file mode 100644
index 0000000..cab7740
--- /dev/null
+++ b/libvpx/vpx_ports/msvc.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MSVC_H_
+#define VPX_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "./vpx_config.h"
+
+# if _MSC_VER < 1900  // VS2015 provides snprintf
+#  define snprintf _snprintf
+# endif  // _MSC_VER < 1900
+
+#if _MSC_VER < 1800  // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+#endif  // _MSC_VER < 1800
+
+#endif  // _MSC_VER
+#endif  // VPX_PORTS_MSVC_H_
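
The round() shim rounds halves away from zero, matching the C99 round() that newer MSVC runtimes and other compilers already provide, so behaviour is the same whichever definition is in effect. An illustrative check:

  #include <assert.h>
  #include <math.h>

  static void round_shim_example(void) {
    assert(round(2.5) == 3.0);    /* halves round away from zero... */
    assert(round(-2.5) == -3.0);  /* ...in both directions */
  }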
diff --git a/libvpx/vpx_ports/system_state.h b/libvpx/vpx_ports/system_state.h
new file mode 100644
index 0000000..086c646
--- /dev/null
+++ b/libvpx/vpx_ports/system_state.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_SYSTEM_STATE_H_
+#define VPX_PORTS_SYSTEM_STATE_H_
+
+#include "./vpx_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vpx_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vpx_clear_system_state()
+#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // VPX_PORTS_SYSTEM_STATE_H_
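
vpx_clear_system_state() expands to vpx_reset_mmx_state() on x86 targets, which executes EMMS so leftover MMX register state cannot corrupt later x87 floating-point math, and to a no-op elsewhere. An illustrative caller; the function below is an example, not libvpx code:

  #include "vpx_ports/system_state.h"

  /* Clear MMX state before relying on floating point after a code path that
   * may have executed MMX instructions. */
  static double average_example(const int *v, int n) {
    int i, sum = 0;
    for (i = 0; i < n; i++) sum += v[i];
    vpx_clear_system_state();  /* vpx_reset_mmx_state() (EMMS) on x86, no-op elsewhere */
    return n ? (double)sum / n : 0.0;
  }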
diff --git a/libvpx/vpx_ports/vpx_once.h b/libvpx/vpx_ports/vpx_once.h
index bd9eebd..f1df394 100644
--- a/libvpx/vpx_ports/vpx_once.h
+++ b/libvpx/vpx_ports/vpx_once.h
@@ -110,7 +110,7 @@
 
 
 #else
-/* No-op version that performs no synchronization. vp8_rtcd() is idempotent,
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
  * so as long as your platform provides atomic loads/stores of pointers
  * no synchronization is strictly necessary.
  */
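
The reworded comment applies to every *_rtcd() initializer: each one just writes the same function-pointer values into a table, so running it more than once is harmless and the unsynchronized fallback only risks redundant work. A hedged sketch of that idempotent-init pattern; every name below is illustrative:

  typedef void (*copy_fn)(unsigned char *dst, const unsigned char *src, int n);
  static copy_fn copy_impl;

  static void copy_c(unsigned char *dst, const unsigned char *src, int n) {
    int i;
    for (i = 0; i < n; i++) dst[i] = src[i];
  }

  static void setup_rtcd_example(void) {
    copy_impl = copy_c;  /* always ends in the same state, however often it runs */
  }

  static void once_example(void (*func)(void)) {
    static volatile int done;  /* no lock: relies on word stores being atomic */
    if (!done) {
      func();
      done = 1;
    }
  }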
diff --git a/libvpx/vpx_ports/vpx_ports.mk b/libvpx/vpx_ports/vpx_ports.mk
index 869a204..36b1493 100644
--- a/libvpx/vpx_ports/vpx_ports.mk
+++ b/libvpx/vpx_ports/vpx_ports.mk
@@ -11,14 +11,16 @@
 
 PORTS_SRCS-yes += vpx_ports.mk
 
-PORTS_SRCS-$(BUILD_LIBVPX) += asm_offsets.h
-PORTS_SRCS-$(BUILD_LIBVPX) += mem.h
-PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h
+PORTS_SRCS-yes += bitops.h
+PORTS_SRCS-yes += mem.h
+PORTS_SRCS-yes += msvc.h
+PORTS_SRCS-yes += system_state.h
+PORTS_SRCS-yes += vpx_timer.h
 
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-PORTS_SRCS-$(BUILD_LIBVPX) += emms.asm
-PORTS_SRCS-$(BUILD_LIBVPX) += x86.h
-PORTS_SRCS-$(BUILD_LIBVPX) += x86_abi_support.asm
+PORTS_SRCS-yes += emms.asm
+PORTS_SRCS-yes += x86.h
+PORTS_SRCS-yes += x86_abi_support.asm
 endif
 
 PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
diff --git a/libvpx/vpx_ports/vpx_timer.h b/libvpx/vpx_ports/vpx_timer.h
index 870338b..dd98e29 100644
--- a/libvpx/vpx_ports/vpx_timer.h
+++ b/libvpx/vpx_ports/vpx_timer.h
@@ -11,6 +11,9 @@
 
 #ifndef VPX_PORTS_VPX_TIMER_H_
 #define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
 
 #if CONFIG_OS_SUPPORT
diff --git a/libvpx/vpx_ports/x86.h b/libvpx/vpx_ports/x86.h
index 81c2b8b..5da346e 100644
--- a/libvpx/vpx_ports/x86.h
+++ b/libvpx/vpx_ports/x86.h
@@ -13,6 +13,7 @@
 #define VPX_PORTS_X86_H_
 #include <stdlib.h>
 #include "vpx_config.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -104,6 +105,44 @@
 #endif
 #endif /* end others */
 
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <windows.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
 #define HAS_MMX     0x01
 #define HAS_SSE     0x02
 #define HAS_SSE2    0x04
@@ -120,7 +159,7 @@
 x86_simd_caps(void) {
   unsigned int flags = 0;
   unsigned int mask = ~0;
-  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
   char *env;
   (void)reg_ebx;
 
@@ -136,9 +175,9 @@
     mask = strtol(env, NULL, 0);
 
   /* Ensure that the CPUID instruction supports extended features */
-  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
 
-  if (reg_eax < 1)
+  if (max_cpuid_val < 1)
     return 0;
 
   /* Get the standard feature flags */
@@ -156,14 +195,19 @@
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
-  if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
 
-  /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
-  reg_eax = 7;
-  reg_ecx = 0;
-  cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+      if (max_cpuid_val >= 7) {
+        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
 
-  if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      }
+    }
+  }
 
   return flags & mask;
 }
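
The rewritten detection only reports HAS_AVX when CPUID shows both AVX support and OS-enabled extended state saving (ECX bits 28 and 27), and when XGETBV confirms XMM and YMM state are enabled in XCR0 (mask 0x6); AVX2 is then checked via CPUID leaf 7 EBX bit 5, and only when max_cpuid_val >= 7 so the leaf is never read on CPUs that lack it. A hedged example of a caller dispatching on the returned flags; the function is illustrative:

  #include <stdio.h>
  #include "vpx_ports/x86.h"

  /* Pick a code path from the capability bits returned by x86_simd_caps();
   * the HAS_* flags are the ones defined in this header. */
  static void report_simd_caps(void) {
    const unsigned int caps = x86_simd_caps();
    if (caps & HAS_AVX2)
      printf("using AVX2 kernels\n");
    else if (caps & HAS_AVX)
      printf("using AVX kernels\n");
    else if (caps & HAS_SSE2)
      printf("using SSE2 kernels\n");
    else
      printf("using C-only kernels\n");
  }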
diff --git a/libvpx/vpx_ports/x86_abi_support.asm b/libvpx/vpx_ports/x86_abi_support.asm
index 3814ef4..c94b76a 100644
--- a/libvpx/vpx_ports/x86_abi_support.asm
+++ b/libvpx/vpx_ports/x86_abi_support.asm
@@ -395,7 +395,7 @@
 
 ; On Android platforms use lrand48 when building postproc routines. Prior to L
 ; rand() was not available.
-%if CONFIG_POSTPROC=1
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
 %ifdef __ANDROID__
 extern sym(lrand48)
 %define LIBVPX_RAND lrand48
@@ -403,4 +403,4 @@
 extern sym(rand)
 %define LIBVPX_RAND rand
 %endif
-%endif ; CONFIG_POSTPROC
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/libvpx/vpx_scale/generic/gen_scalers.c b/libvpx/vpx_scale/generic/gen_scalers.c
index 5f355c5..dab324e 100644
--- a/libvpx/vpx_scale/generic/gen_scalers.c
+++ b/libvpx/vpx_scale/generic/gen_scalers.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include "./vpx_scale_rtcd.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_mem/vpx_mem.h"
 /****************************************************************************
@@ -215,7 +215,7 @@
                                    unsigned int dest_width) {
   (void) dest_pitch;
   (void) src_pitch;
-  vpx_memcpy(dest, source, dest_width);
+  memcpy(dest, source, dest_width);
 }
 
 void vp8_vertical_band_2_1_scale_i_c(unsigned char *source,
diff --git a/libvpx/vpx_scale/generic/vpx_scale.c b/libvpx/vpx_scale/generic/vpx_scale.c
index 8044d2a..15e4ba8 100644
--- a/libvpx/vpx_scale/generic/vpx_scale.c
+++ b/libvpx/vpx_scale/generic/vpx_scale.c
@@ -22,6 +22,7 @@
 ****************************************************************************/
 #include "./vpx_scale_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vpx_scale/yv12config.h"
 
 typedef struct {
@@ -379,7 +380,7 @@
       vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
 
       if (interpolation)
-        vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+        memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
 
       /* Next band... */
       source += (unsigned long) source_band_height  * source_pitch;
@@ -432,7 +433,7 @@
                  temp_area + i * dest_pitch, 1, hratio, dest_width);
       } else { /*  Duplicate the last row */
         /* copy temp_area row 0 over from last row in the past */
-        vpx_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+        memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
       }
     }
 
@@ -443,7 +444,7 @@
     }
 
     /* copy temp_area row 0 over from last row in the past */
-    vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+    memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
 
     /* move to the next band */
     source += source_band_height * source_pitch;
@@ -498,11 +499,11 @@
 
   if (dw < (int)dst->y_width)
     for (i = 0; i < dh; i++)
-      vpx_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
+      memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
 
   if (dh < (int)dst->y_height)
     for (i = dh - 1; i < (int)dst->y_height; i++)
-      vpx_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+      memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
 
   Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
           (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -510,11 +511,11 @@
 
   if (dw / 2 < (int)dst->uv_width)
     for (i = 0; i < dst->uv_height; i++)
-      vpx_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+      memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int)dst->uv_height)
     for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-      vpx_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+      memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
 
   Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
           (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -522,9 +523,9 @@
 
   if (dw / 2 < (int)dst->uv_width)
     for (i = 0; i < dst->uv_height; i++)
-      vpx_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+      memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int) dst->uv_height)
     for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-      vpx_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+      memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
 }
diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c
index 827bce7..7739218 100644
--- a/libvpx/vpx_scale/generic/yv12config.c
+++ b/libvpx/vpx_scale/generic/yv12config.c
@@ -10,9 +10,9 @@
 
 #include <assert.h>
 
-#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 
 /****************************************************************************
 *  Exports
@@ -36,7 +36,7 @@
     /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
       u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
       all of this so that a freed pointer isn't inadvertently used */
-    vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
   } else {
     return -1;
   }
@@ -114,10 +114,10 @@
   return -2;
 }
 
-#if CONFIG_VP9
+#if CONFIG_VP9 || CONFIG_VP10
 // TODO(jkoleszar): Maybe replace this with struct vpx_image
 
-int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   if (ybf) {
     if (ybf->buffer_alloc_sz > 0) {
       vpx_free(ybf->buffer_alloc);
@@ -126,7 +126,7 @@
     /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
       u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
       all of this so that a freed pointer isn't inadvertently used */
-    vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
   } else {
     return -1;
   }
@@ -134,25 +134,32 @@
   return 0;
 }
 
-int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height,
-                             int ss_x, int ss_y, int border,
+                             int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
+                             int border,
+                             int byte_alignment,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb,
                              void *cb_priv) {
   if (ybf) {
+    const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
     const int aligned_width = (width + 7) & ~7;
     const int aligned_height = (height + 7) & ~7;
     const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
     const uint64_t yplane_size = (aligned_height + 2 * border) *
-                                 (uint64_t)y_stride;
+                                 (uint64_t)y_stride + byte_alignment;
     const int uv_width = aligned_width >> ss_x;
     const int uv_height = aligned_height >> ss_y;
     const int uv_stride = y_stride >> ss_x;
     const int uv_border_w = border >> ss_x;
     const int uv_border_h = border >> ss_y;
     const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
-                                  (uint64_t)uv_stride;
+                                  (uint64_t)uv_stride + byte_alignment;
+
 #if CONFIG_ALPHA
     const int alpha_width = aligned_width;
     const int alpha_height = aligned_height;
@@ -160,12 +167,25 @@
     const int alpha_border_w = border;
     const int alpha_border_h = border;
     const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
-                                      (uint64_t)alpha_stride;
+                                      (uint64_t)alpha_stride + byte_alignment;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size = (1 + use_highbitdepth) *
+        (yplane_size + 2 * uvplane_size + alpha_plane_size);
+#else
     const uint64_t frame_size = yplane_size + 2 * uvplane_size +
                                 alpha_plane_size;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
 #else
     const uint64_t frame_size = yplane_size + 2 * uvplane_size;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_ALPHA
+
+    uint8_t *buf = NULL;
+
     if (cb != NULL) {
       const int align_addr_extra_size = 31;
       const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -182,11 +202,6 @@
       if (fb->data == NULL || fb->size < external_frame_size)
         return -1;
 
-      // This memset is needed for fixing valgrind error from C loop filter
-      // due to access uninitialized memory in frame border. It could be
-      // removed if border is totally removed.
-      vpx_memset(fb->data, 0, fb->size);
-
       ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
     } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
@@ -205,7 +220,7 @@
       // This memset is needed for fixing valgrind error from C loop filter
       // due to access uninitialized memory in frame border. It could be
       // removed if border is totally removed.
-      vpx_memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+      memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
     }
 
     /* Only support allocating buffers that have a border that's a multiple
@@ -230,19 +245,36 @@
 
     ybf->border = border;
     ybf->frame_size = (int)frame_size;
+    ybf->subsampling_x = ss_x;
+    ybf->subsampling_y = ss_y;
 
-    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
-    ybf->u_buffer = ybf->buffer_alloc + yplane_size +
-                    (uv_border_h * uv_stride) + uv_border_w;
-    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
-                    (uv_border_h * uv_stride) + uv_border_w;
+    buf = ybf->buffer_alloc;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (use_highbitdepth) {
+      // Store uint16 addresses when using 16bit framebuffers
+      buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+    } else {
+      ybf->flags = 0;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    ybf->y_buffer = (uint8_t *)yv12_align_addr(
+        buf + (border * y_stride) + border, vp9_byte_align);
+    ybf->u_buffer = (uint8_t *)yv12_align_addr(
+        buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+        vp9_byte_align);
+    ybf->v_buffer = (uint8_t *)yv12_align_addr(
+        buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) +
+        uv_border_w, vp9_byte_align);
 
 #if CONFIG_ALPHA
     ybf->alpha_width = alpha_width;
     ybf->alpha_height = alpha_height;
     ybf->alpha_stride = alpha_stride;
-    ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size +
-                        (alpha_border_h * alpha_stride) + alpha_border_w;
+    ybf->alpha_buffer = (uint8_t *)yv12_align_addr(
+        buf + yplane_size + 2 * uvplane_size +
+        (alpha_border_h * alpha_stride) + alpha_border_w, vp9_byte_align);
 #endif
     ybf->corrupted = 0; /* assume not corrupted by errors */
     return 0;
@@ -250,13 +282,21 @@
   return -2;
 }
 
-int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                            int width, int height,
-                           int ss_x, int ss_y, int border) {
+                           int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
+                           int border,
+                           int byte_alignment) {
   if (ybf) {
-    vp9_free_frame_buffer(ybf);
-    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
-                                    NULL, NULL, NULL);
+    vpx_free_frame_buffer(ybf);
+    return vpx_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                    use_highbitdepth,
+#endif
+                                    border, byte_alignment, NULL, NULL, NULL);
   }
   return -2;
 }
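
Note on the vpx_realloc_frame_buffer() changes above: each plane size is now
padded by |byte_alignment| and every plane pointer is rounded up with
yv12_align_addr(), so a caller-requested alignment (a power of 2 from 32 to
1024; 0 keeps the legacy behaviour) holds no matter where buffer_alloc lands.
With CONFIG_VP9_HIGHBITDEPTH and use_highbitdepth set, frame_size doubles and
the stored pointers go through CONVERT_TO_BYTEPTR. A stand-alone sketch of
the size arithmetic, assuming 8-bit buffers, no CONFIG_ALPHA and
byte_alignment == 0:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int width = 352, height = 288, border = 32, ss_x = 1, ss_y = 1;
    const int aligned_width  = (width + 7) & ~7;
    const int aligned_height = (height + 7) & ~7;
    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
    const uint64_t yplane_size =
        (aligned_height + 2 * border) * (uint64_t)y_stride;
    const int uv_stride = y_stride >> ss_x;
    const int uv_height = aligned_height >> ss_y;
    const int uv_border_h = border >> ss_y;
    const uint64_t uvplane_size =
        (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride;
    const uint64_t frame_size = yplane_size + 2 * uvplane_size;
    printf("y_stride=%d yplane=%llu uvplane=%llu total=%llu\n", y_stride,
           (unsigned long long)yplane_size, (unsigned long long)uvplane_size,
           (unsigned long long)frame_size);
    return 0;  /* 416 / 146432 / 36608 / 219648 for this 352x288 example */
  }
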
diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c
index 036a505..670144b 100644
--- a/libvpx/vpx_scale/generic/yv12extend.c
+++ b/libvpx/vpx_scale/generic/yv12extend.c
@@ -10,9 +10,14 @@
 
 #include <assert.h>
 #include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 
 static void extend_plane(uint8_t *const src, int src_stride,
                          int width, int height,
@@ -28,8 +33,8 @@
   uint8_t *dst_ptr2 = src + width;
 
   for (i = 0; i < height; ++i) {
-    vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
-    vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    memset(dst_ptr2, src_ptr2[0], extend_right);
     src_ptr1 += src_stride;
     src_ptr2 += src_stride;
     dst_ptr1 += src_stride;
@@ -45,16 +50,60 @@
   dst_ptr2 = src + src_stride * height - extend_left;
 
   for (i = 0; i < extend_top; ++i) {
-    vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+    memcpy(dst_ptr1, src_ptr1, linesize);
     dst_ptr1 += src_stride;
   }
 
   for (i = 0; i < extend_bottom; ++i) {
-    vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+    memcpy(dst_ptr2, src_ptr2, linesize);
     dst_ptr2 += src_stride;
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+                              int width, int height,
+                              int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+  /* copy the left and right most columns out */
+  uint16_t *src_ptr1 = src;
+  uint16_t *src_ptr2 = src + width - 1;
+  uint16_t *dst_ptr1 = src - extend_left;
+  uint16_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += src_stride;
+  }
+}
+#endif
+
 void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
   const int uv_border = ybf->border / 2;
 
@@ -64,6 +113,31 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(
+        ybf->y_buffer, ybf->y_stride,
+        ybf->y_crop_width, ybf->y_crop_height,
+        ybf->border, ybf->border,
+        ybf->border + ybf->y_height - ybf->y_crop_height,
+        ybf->border + ybf->y_width - ybf->y_crop_width);
+
+    extend_plane_high(
+        ybf->u_buffer, ybf->uv_stride,
+        ybf->uv_crop_width, ybf->uv_crop_height,
+        uv_border, uv_border,
+        uv_border + ybf->uv_height - ybf->uv_crop_height,
+        uv_border + ybf->uv_width - ybf->uv_crop_width);
+
+    extend_plane_high(
+        ybf->v_buffer, ybf->uv_stride,
+        ybf->uv_crop_width, ybf->uv_crop_height,
+        uv_border, uv_border,
+        uv_border + ybf->uv_height - ybf->uv_crop_height,
+        uv_border + ybf->uv_width - ybf->uv_crop_width);
+    return;
+  }
+#endif
   extend_plane(ybf->y_buffer, ybf->y_stride,
                ybf->y_crop_width, ybf->y_crop_height,
                ybf->border, ybf->border,
@@ -83,7 +157,7 @@
                uv_border + ybf->uv_width - ybf->uv_crop_width);
 }
 
-#if CONFIG_VP9
+#if CONFIG_VP9 || CONFIG_VP10
 static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   const int c_w = ybf->uv_crop_width;
   const int c_h = ybf->uv_crop_height;
@@ -99,6 +173,20 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride,
+                      ybf->y_crop_width, ybf->y_crop_height,
+                      ext_size, ext_size,
+                      ext_size + ybf->y_height - ybf->y_crop_height,
+                      ext_size + ybf->y_width - ybf->y_crop_width);
+    extend_plane_high(ybf->u_buffer, ybf->uv_stride,
+                      c_w, c_h, c_et, c_el, c_eb, c_er);
+    extend_plane_high(ybf->v_buffer, ybf->uv_stride,
+                      c_w, c_h, c_et, c_el, c_eb, c_er);
+    return;
+  }
+#endif
   extend_plane(ybf->y_buffer, ybf->y_stride,
                ybf->y_crop_width, ybf->y_crop_height,
                ext_size, ext_size,
@@ -112,16 +200,24 @@
                c_w, c_h, c_et, c_el, c_eb, c_er);
 }
 
-void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+void vpx_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
   extend_frame(ybf, ybf->border);
 }
 
-void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
+void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
   const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ?
                        VP9INNERBORDERINPIXELS : ybf->border;
   extend_frame(ybf, inner_bw);
 }
-#endif  // CONFIG_VP9
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  memcpy(dst, src, num * sizeof(uint16_t));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9 || CONFIG_VP10
 
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
@@ -140,8 +236,42 @@
   assert(src_ybc->y_height == dst_ybc->y_height);
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
+    for (row = 0; row < src_ybc->y_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->y_width);
+      src += src_ybc->y_stride;
+      dst += dst_ybc->y_stride;
+    }
+
+    src = src_ybc->u_buffer;
+    dst = dst_ybc->u_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    src = src_ybc->v_buffer;
+    dst = dst_ybc->v_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_c(dst_ybc);
+    return;
+  } else {
+    assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
+  }
+#endif
+
   for (row = 0; row < src_ybc->y_height; ++row) {
-    vpx_memcpy(dst, src, src_ybc->y_width);
+    memcpy(dst, src, src_ybc->y_width);
     src += src_ybc->y_stride;
     dst += dst_ybc->y_stride;
   }
@@ -150,7 +280,7 @@
   dst = dst_ybc->u_buffer;
 
   for (row = 0; row < src_ybc->uv_height; ++row) {
-    vpx_memcpy(dst, src, src_ybc->uv_width);
+    memcpy(dst, src, src_ybc->uv_width);
     src += src_ybc->uv_stride;
     dst += dst_ybc->uv_stride;
   }
@@ -159,7 +289,7 @@
   dst = dst_ybc->v_buffer;
 
   for (row = 0; row < src_ybc->uv_height; ++row) {
-    vpx_memcpy(dst, src, src_ybc->uv_width);
+    memcpy(dst, src, src_ybc->uv_width);
     src += src_ybc->uv_stride;
     dst += dst_ybc->uv_stride;
   }
@@ -173,8 +303,21 @@
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+    for (row = 0; row < src_ybc->y_height; ++row) {
+      memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+      src16 += src_ybc->y_stride;
+      dst16 += dst_ybc->y_stride;
+    }
+    return;
+  }
+#endif
+
   for (row = 0; row < src_ybc->y_height; ++row) {
-    vpx_memcpy(dst, src, src_ybc->y_width);
+    memcpy(dst, src, src_ybc->y_width);
     src += src_ybc->y_stride;
     dst += dst_ybc->y_stride;
   }
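
Note on yv12extend.c above: extend_plane() (and the new extend_plane_high()
for 16-bit buffers) fills the frame border by replication -- every row's
first and last pixels are smeared across the left/right border, then the
fully extended top and bottom rows are copied outward; the high-bit-depth
variant does the same with vpx_memset16() and memcpy() of uint16_t rows. A
stand-alone sketch of the same pattern, not a libvpx API:

  #include <stdint.h>
  #include <string.h>

  /* |plane| points at the top-left visible pixel of a buffer that has
   * |border| extra pixels on every side; |stride| covers the full row. */
  static void extend_borders(uint8_t *plane, int stride, int width,
                             int height, int border) {
    const int ext_width = width + 2 * border;
    int i;
    for (i = 0; i < height; ++i) {        /* left/right replication */
      uint8_t *row = plane + i * stride;
      memset(row - border, row[0], border);
      memset(row + width, row[width - 1], border);
    }
    for (i = 1; i <= border; ++i) {       /* top/bottom replication */
      memcpy(plane - border - i * stride, plane - border, ext_width);
      memcpy(plane - border + (height - 1 + i) * stride,
             plane - border + (height - 1) * stride, ext_width);
    }
  }
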
diff --git a/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
index 0dfc47c..aab4785 100644
--- a/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
+++ b/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
@@ -94,12 +94,12 @@
   linesize = extend_left + extend_right + width;
 
   for (i = 0; i < extend_top; i++) {
-    vpx_memcpy(top_dst, top_src, linesize);
+    memcpy(top_dst, top_src, linesize);
     top_dst += src_stride;
   }
 
   for (i = 0; i < extend_bottom; i++) {
-    vpx_memcpy(bot_dst, bot_src, linesize);
+    memcpy(bot_dst, bot_src, linesize);
     bot_dst += src_stride;
   }
 }
@@ -132,11 +132,11 @@
                c_w, c_h, c_et, c_el, c_eb, c_er);
 }
 
-void vp9_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
+void vpx_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
   extend_frame(ybf, ybf->border);
 }
 
-void vp9_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
+void vpx_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
   const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ?
                        VP9INNERBORDERINPIXELS : ybf->border;
   extend_frame(ybf, inner_bw);
diff --git a/libvpx/vpx_scale/vpx_scale.mk b/libvpx/vpx_scale/vpx_scale.mk
index 0a1594b..a49abf3 100644
--- a/libvpx/vpx_scale/vpx_scale.mk
+++ b/libvpx/vpx_scale/vpx_scale.mk
@@ -1,11 +1,10 @@
 SCALE_SRCS-yes += vpx_scale.mk
 SCALE_SRCS-yes += yv12config.h
-SCALE_SRCS-yes += vpx_scale.h
-SCALE_SRCS-yes += generic/vpx_scale.c
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += vpx_scale.h
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/vpx_scale.c
 SCALE_SRCS-yes += generic/yv12config.c
 SCALE_SRCS-yes += generic/yv12extend.c
 SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
-SCALE_SRCS-yes += vpx_scale_asm_offsets.c
 SCALE_SRCS-yes += vpx_scale_rtcd.c
 SCALE_SRCS-yes += vpx_scale_rtcd.pl
 
@@ -14,7 +13,4 @@
 
 SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
 
-$(eval $(call asm_offsets_template,\
-	         vpx_scale_asm_offsets.asm, vpx_scale/vpx_scale_asm_offsets.c))
-
 $(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl))
diff --git a/libvpx/vpx_scale/vpx_scale_asm_offsets.c b/libvpx/vpx_scale/vpx_scale_asm_offsets.c
deleted file mode 100644
index caa9e80..0000000
--- a/libvpx/vpx_scale/vpx_scale_asm_offsets.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_NEON
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.c b/libvpx/vpx_scale/vpx_scale_rtcd.c
index 656a22f..bea603f 100644
--- a/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -7,9 +7,9 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #define RTCD_C
-#include "vpx_scale_rtcd.h"
+#include "./vpx_scale_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
 void vpx_scale_rtcd()
diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libvpx/vpx_scale/vpx_scale_rtcd.pl
index d4a2b81..56b952b 100644
--- a/libvpx/vpx_scale/vpx_scale_rtcd.pl
+++ b/libvpx/vpx_scale/vpx_scale_rtcd.pl
@@ -22,11 +22,11 @@
 
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 
-if (vpx_config("CONFIG_VP9") eq "yes") {
-    add_proto qw/void vp9_extend_frame_borders/, "struct yv12_buffer_config *ybf";
-    specialize qw/vp9_extend_frame_borders dspr2/;
+if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+    add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_borders dspr2/;
 
-    add_proto qw/void vp9_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
-    specialize qw/vp9_extend_frame_inner_borders dspr2/;
+    add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_inner_borders dspr2/;
 }
 1;
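
Note on vpx_scale_rtcd.pl above: add_proto declares the dispatchable
prototype and specialize lists the per-ISA variants; the generated
vpx_scale_rtcd.h then resolves each symbol to the best available
implementation when vpx_scale_rtcd() runs. A rough hand-written approximation
of what that generated dispatch looks like for vpx_extend_frame_borders --
the real header is produced by the rtcd generator and differs in detail:

  struct yv12_buffer_config;

  void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
  #if HAVE_DSPR2
  void vpx_extend_frame_borders_dspr2(struct yv12_buffer_config *ybf);
  #endif

  /* function pointer filled in once, at runtime */
  extern void (*vpx_extend_frame_borders)(struct yv12_buffer_config *ybf);

  static void setup_rtcd_internal(void) {
    vpx_extend_frame_borders = vpx_extend_frame_borders_c;
  #if HAVE_DSPR2
    vpx_extend_frame_borders = vpx_extend_frame_borders_dspr2;
  #endif
  }
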
diff --git a/libvpx/vpx_scale/win32/scaleopt.c b/libvpx/vpx_scale/win32/scaleopt.c
deleted file mode 100644
index 4336ece..0000000
--- a/libvpx/vpx_scale/win32/scaleopt.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     scaleopt.cpp
-*
-*   Description  :     Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-/****************************************************************************
-*  Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-
-__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
-
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_5_4_scale_mmx
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 4 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
-  const unsigned char *source,
-  unsigned int source_width,
-  unsigned char *dest,
-  unsigned int dest_width
-) {
-  /*
-  unsigned i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
-  (void) dest_width;
-
-  for ( i=0; i<source_width; i+=5 )
-  {
-      a = src[0];
-      b = src[1];
-      c = src[2];
-      d = src[3];
-      e = src[4];
-
-      des[0] = a;
-      des[1] = ((b*192 + c* 64 + 128)>>8);
-      des[2] = ((c*128 + d*128 + 128)>>8);
-      des[3] = ((d* 64 + e*192 + 128)>>8);
-
-      src += 5;
-      des += 4;
-  }
-  */
-  (void) dest_width;
-
-  __asm {
-
-    mov         esi,        source;
-    mov         edi,        dest;
-
-    mov         ecx,        source_width;
-    movq        mm5,        const54_1;
-
-    pxor        mm7,        mm7;
-    movq        mm6,        const54_2;
-
-    movq        mm4,        round_values;
-    lea         edx,        [esi+ecx];
-    horizontal_line_5_4_loop:
-
-    movq        mm0,        QWORD PTR  [esi];
-    00 01 02 03 04 05 06 07
-    movq        mm1,        mm0;
-    00 01 02 03 04 05 06 07
-
-    psrlq       mm0,        8;
-    01 02 03 04 05 06 07 xx
-    punpcklbw   mm1,        mm7;
-    xx 00 xx 01 xx 02 xx 03
-
-    punpcklbw   mm0,        mm7;
-    xx 01 xx 02 xx 03 xx 04
-    pmullw      mm1,        mm5
-
-    pmullw      mm0,        mm6
-    add         esi,        5
-
-    add         edi,        4
-    paddw       mm1,        mm0
-
-    paddw       mm1,        mm4
-    psrlw       mm1,        8
-
-    cmp         esi,        edx
-    packuswb    mm1,        mm7
-
-    movd        DWORD PTR [edi-4], mm1
-
-    jl          horizontal_line_5_4_loop
-
-  }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
-__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
-  __asm {
-    push        ebx
-
-    mov         esi,    source                    // Get the source and destination pointer
-    mov         ecx,    src_pitch               // Get the pitch size
-
-    mov         edi,    dest                    // tow lines below
-    pxor        mm7,    mm7                     // clear out mm7
-
-    mov         edx,    dest_pitch               // Loop counter
-    mov         ebx,    dest_width
-
-    vs_5_4_loop:
-
-    movd        mm0,    DWORD ptr [esi]         // src[0];
-    movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
-
-    movd        mm2,    DWORD ptr [esi+ecx*2]
-    lea         eax,    [esi+ecx*2]             //
-
-    punpcklbw   mm1,    mm7
-    punpcklbw   mm2,    mm7
-
-    movq        mm3,    mm2
-    pmullw      mm1,    three_fourths
-
-    pmullw      mm2,    one_fourths
-    movd        mm4,    [eax+ecx]
-
-    pmullw      mm3,    two_fourths
-    punpcklbw   mm4,    mm7
-
-    movq        mm5,    mm4
-    pmullw      mm4,    two_fourths
-
-    paddw       mm1,    mm2
-    movd        mm6,    [eax+ecx*2]
-
-    pmullw      mm5,    one_fourths
-    paddw       mm1,    round_values;
-
-    paddw       mm3,    mm4
-    psrlw       mm1,    8
-
-    punpcklbw   mm6,    mm7
-    paddw       mm3,    round_values
-
-    pmullw      mm6,    three_fourths
-    psrlw       mm3,    8
-
-    packuswb    mm1,    mm7
-    packuswb    mm3,    mm7
-
-    movd        DWORD PTR [edi], mm0
-    movd        DWORD PTR [edi+edx], mm1
-
-
-    paddw       mm5,    mm6
-    movd        DWORD PTR [edi+edx*2], mm3
-
-    lea         eax,    [edi+edx*2]
-    paddw       mm5,    round_values
-
-    psrlw       mm5,    8
-    add         edi,    4
-
-    packuswb    mm5,    mm7
-    movd        DWORD PTR [eax+edx], mm5
-
-    add         esi,    4
-    sub         ebx,    4
-
-    jg         vs_5_4_loop
-
-    pop         ebx
-  }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
-  const unsigned char *source,
-  unsigned int source_width,
-  unsigned char *dest,
-  unsigned int dest_width
-) {
-
-  (void) dest_width;
-  __asm {
-
-    mov         esi,        source;
-    mov         edi,        dest;
-
-    mov         ecx,        source_width;
-    movq        mm5,        const53_1;
-
-    pxor        mm7,        mm7;
-    movq        mm6,        const53_2;
-
-    movq        mm4,        round_values;
-    lea         edx,        [esi+ecx-5];
-    horizontal_line_5_3_loop:
-
-    movq        mm0,        QWORD PTR  [esi];
-    00 01 02 03 04 05 06 07
-    movq        mm1,        mm0;
-    00 01 02 03 04 05 06 07
-
-    psllw       mm0,        8;
-    xx 00 xx 02 xx 04 xx 06
-    psrlw       mm1,        8;
-    01 xx 03 xx 05 xx 07 xx
-
-    psrlw       mm0,        8;
-    00 xx 02 xx 04 xx 06 xx
-    psllq       mm1,        16;
-    xx xx 01 xx 03 xx 05 xx
-
-    pmullw      mm0,        mm6
-
-    pmullw      mm1,        mm5
-    add         esi,        5
-
-    add         edi,        3
-    paddw       mm1,        mm0
-
-    paddw       mm1,        mm4
-    psrlw       mm1,        8
-
-    cmp         esi,        edx
-    packuswb    mm1,        mm7
-
-    movd        DWORD PTR [edi-3], mm1
-    jl          horizontal_line_5_3_loop
-
-// exit condition
-    movq        mm0,        QWORD PTR  [esi];
-    00 01 02 03 04 05 06 07
-    movq        mm1,        mm0;
-    00 01 02 03 04 05 06 07
-
-    psllw       mm0,        8;
-    xx 00 xx 02 xx 04 xx 06
-    psrlw       mm1,        8;
-    01 xx 03 xx 05 xx 07 xx
-
-    psrlw       mm0,        8;
-    00 xx 02 xx 04 xx 06 xx
-    psllq       mm1,        16;
-    xx xx 01 xx 03 xx 05 xx
-
-    pmullw      mm0,        mm6
-
-    pmullw      mm1,        mm5
-    paddw       mm1,        mm0
-
-    paddw       mm1,        mm4
-    psrlw       mm1,        8
-
-    packuswb    mm1,        mm7
-    movd        eax,        mm1
-
-    mov         edx,        eax
-    shr         edx,        16
-
-    mov         WORD PTR[edi],   ax
-    mov         BYTE PTR[edi+2], dl
-
-  }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
-  __asm {
-    push        ebx
-
-    mov         esi,    source                    // Get the source and destination pointer
-    mov         ecx,    src_pitch               // Get the pitch size
-
-    mov         edi,    dest                    // tow lines below
-    pxor        mm7,    mm7                     // clear out mm7
-
-    mov         edx,    dest_pitch               // Loop counter
-    movq        mm5,    one_thirds
-
-    movq        mm6,    two_thirds
-    mov         ebx,    dest_width;
-
-    vs_5_3_loop:
-
-    movd        mm0,    DWORD ptr [esi]         // src[0];
-    movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
-
-    movd        mm2,    DWORD ptr [esi+ecx*2]
-    lea         eax,    [esi+ecx*2]             //
-
-    punpcklbw   mm1,    mm7
-    punpcklbw   mm2,    mm7
-
-    pmullw      mm1,    mm5
-    pmullw      mm2,    mm6
-
-    movd        mm3,    DWORD ptr [eax+ecx]
-    movd        mm4,    DWORD ptr [eax+ecx*2]
-
-    punpcklbw   mm3,    mm7
-    punpcklbw   mm4,    mm7
-
-    pmullw      mm3,    mm6
-    pmullw      mm4,    mm5
-
-
-    movd        DWORD PTR [edi], mm0
-    paddw       mm1,    mm2
-
-    paddw       mm1,    round_values
-    psrlw       mm1,    8
-
-    packuswb    mm1,    mm7
-    paddw       mm3,    mm4
-
-    paddw       mm3,    round_values
-    movd        DWORD PTR [edi+edx], mm1
-
-    psrlw       mm3,    8
-    packuswb    mm3,    mm7
-
-    movd        DWORD PTR [edi+edx*2], mm3
-
-
-    add         edi,    4
-    add         esi,    4
-
-    sub         ebx,    4
-    jg          vs_5_3_loop
-
-    pop         ebx
-  }
-}
-
-
-
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_2_1_scale
- *
- *  INPUTS        : const unsigned char *source :
- *                  unsigned int source_width    :
- *                  unsigned char *dest         :
- *                  unsigned int dest_width      :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
-  const unsigned char *source,
-  unsigned int source_width,
-  unsigned char *dest,
-  unsigned int dest_width
-) {
-  (void) dest_width;
-  (void) source_width;
-  __asm {
-    mov         esi,    source
-    mov         edi,    dest
-
-    pxor        mm7,    mm7
-    mov         ecx,    dest_width
-
-    xor         edx,    edx
-    hs_2_1_loop:
-
-    movq        mm0,    [esi+edx*2]
-    psllw       mm0,    8
-
-    psrlw       mm0,    8
-    packuswb    mm0,    mm7
-
-    movd        DWORD Ptr [edi+edx], mm0;
-    add         edx,    4
-
-    cmp         edx,    ecx
-    jl          hs_2_1_loop
-
-  }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-  (void) dest_pitch;
-  (void) src_pitch;
-  vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
-  (void) dest_pitch;
-  __asm {
-    mov         esi,        source
-    mov         edi,        dest
-
-    mov         eax,        src_pitch
-    mov         edx,        dest_width
-
-    pxor        mm7,        mm7
-    sub         esi,        eax             // back one line
-
-
-    lea         ecx,        [esi+edx];
-    movq        mm6,        round_values;
-
-    movq        mm5,        three_sixteenths;
-    movq        mm4,        ten_sixteenths;
-
-    vs_2_1_i_loop:
-    movd        mm0,        [esi]           //
-    movd        mm1,        [esi+eax]       //
-
-    movd        mm2,        [esi+eax*2]     //
-    punpcklbw   mm0,        mm7
-
-    pmullw      mm0,        mm5
-    punpcklbw   mm1,        mm7
-
-    pmullw      mm1,        mm4
-    punpcklbw   mm2,        mm7
-
-    pmullw      mm2,        mm5
-    paddw       mm0,        round_values
-
-    paddw       mm1,        mm2
-    paddw       mm0,        mm1
-
-    psrlw       mm0,        8
-    packuswb    mm0,        mm7
-
-    movd        DWORD PTR [edi],        mm0
-    add         esi,        4
-
-    add         edi,        4;
-    cmp         esi,        ecx
-    jl          vs_2_1_i_loop
-
-  }
-}
-
-
-
-void
-register_mmxscalers(void) {
-  vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
-  vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
-  vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
-  vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
-  vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
-  vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
-  vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
-}
diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h
index cdde75c..fd5d54b 100644
--- a/libvpx/vpx_scale/yv12config.h
+++ b/libvpx/vpx_scale/yv12config.h
@@ -15,6 +15,8 @@
 extern "C" {
 #endif
 
+#include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_frame_buffer.h"
 #include "vpx/vpx_integer.h"
 
@@ -50,34 +52,48 @@
   int buffer_alloc_sz;
   int border;
   int frame_size;
+  int subsampling_x;
+  int subsampling_y;
+  unsigned int bit_depth;
+  vpx_color_space_t color_space;
 
   int corrupted;
   int flags;
 } YV12_BUFFER_CONFIG;
 
+#define YV12_FLAG_HIGHBITDEPTH 8
+
 int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                 int width, int height, int border);
 int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                   int width, int height, int border);
 int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
-int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                            int width, int height, int ss_x, int ss_y,
-                           int border);
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
+                           int border, int byte_alignment);
 
-// Updates the yv12 buffer config with the frame buffer. If cb is not
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
 // NULL, then libvpx is using the frame buffer callbacks to handle memory.
 // If cb is not NULL, libvpx will call cb with minimum size in bytes needed
 // to decode the current frame. If cb is NULL, libvpx will allocate memory
 // internally to decode the current frame. Returns 0 on success. Returns < 0
 // on failure.
-int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
                              int border,
+                             int byte_alignment,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb,
                              void *cb_priv);
-int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }
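
Note on the yv12config.h API rename above: the vp9_ allocators become vpx_
and take a trailing |byte_alignment|; on CONFIG_VP9_HIGHBITDEPTH builds an
extra use_highbitdepth argument is passed just before |border|. A minimal
caller sketch for a non-high-bit-depth build (the border value of 32 is only
an example):

  #include "vpx_scale/yv12config.h"

  static int alloc_420_frame(YV12_BUFFER_CONFIG *buf, int w, int h) {
    /* ss_x = ss_y = 1 selects 4:2:0 chroma; byte_alignment = 32 requests
     * 32-byte-aligned plane pointers, 0 would keep the legacy behaviour.
     * Returns 0 on success, < 0 on failure. */
    return vpx_alloc_frame_buffer(buf, w, h, /*ss_x=*/1, /*ss_y=*/1,
                                  /*border=*/32, /*byte_alignment=*/32);
  }
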
diff --git a/libvpx/vpx_util/endian_inl.h b/libvpx/vpx_util/endian_inl.h
new file mode 100644
index 0000000..6b177f1
--- /dev/null
+++ b/libvpx/vpx_util/endian_inl.h
@@ -0,0 +1,122 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4, 3) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4, 8) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define VPX_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // gcc will recognize a 'rorw $8, ...' here:
+  return (x >> 8) | ((x & 0xff) << 8);
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(VPX_USE_MIPS32_R2)
+  uint32_t ret;
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+  return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+  uint32_t swapped_bytes;
+  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint32_t)_byteswap_ulong(x);
+#else
+  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif  // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+  return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+  uint64_t swapped_bytes;
+  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint64_t)_byteswap_uint64(x);
+#else  // generic code for swapping 64-bit values (suggested by bdb@)
+  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
+  return x;
+#endif  // HAVE_BUILTIN_BSWAP64
+}
+
+#endif  // VPX_UTIL_ENDIAN_INL_H_
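
Note on the new endian_inl.h above: BSwap16/32/64 pick the fastest available
byte swap (compiler builtins, MIPS32r2 wsbh/rotr, x86 bswap, the MSVC
_byteswap_* intrinsics, or a portable shift/mask fallback), and the
HToLE*/HToBE* macros only swap when the host byte order differs from the
target order. A short usage sketch; the buffer layout is illustrative:

  #include <string.h>
  #include "vpx/vpx_integer.h"
  #include "vpx_util/endian_inl.h"

  static void write_le32(uint8_t *dst, uint32_t value) {
    const uint32_t le = HToLE32(value);  /* no-op on little-endian hosts */
    memcpy(dst, &le, sizeof(le));
  }
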
diff --git a/libvpx/vp9/common/vp9_thread.c b/libvpx/vpx_util/vpx_thread.c
similarity index 85%
rename from libvpx/vp9/common/vp9_thread.c
rename to libvpx/vpx_util/vpx_thread.c
index 1c6aec0..0bb0125 100644
--- a/libvpx/vp9/common/vp9_thread.c
+++ b/libvpx/vpx_util/vpx_thread.c
@@ -15,12 +15,12 @@
 
 #include <assert.h>
 #include <string.h>   // for memset()
-#include "./vp9_thread.h"
+#include "./vpx_thread.h"
 #include "vpx_mem/vpx_mem.h"
 
 #if CONFIG_MULTITHREAD
 
-struct VP9WorkerImpl {
+struct VPxWorkerImpl {
   pthread_mutex_t mutex_;
   pthread_cond_t  condition_;
   pthread_t       thread_;
@@ -28,10 +28,10 @@
 
 //------------------------------------------------------------------------------
 
-static void execute(VP9Worker *const worker);  // Forward declaration.
+static void execute(VPxWorker *const worker);  // Forward declaration.
 
 static THREADFN thread_loop(void *ptr) {
-  VP9Worker *const worker = (VP9Worker*)ptr;
+  VPxWorker *const worker = (VPxWorker*)ptr;
   int done = 0;
   while (!done) {
     pthread_mutex_lock(&worker->impl_->mutex_);
@@ -52,8 +52,8 @@
 }
 
 // main thread state control
-static void change_state(VP9Worker *const worker,
-                         VP9WorkerStatus new_status) {
+static void change_state(VPxWorker *const worker,
+                         VPxWorkerStatus new_status) {
   // No-op when attempting to change state on a thread that didn't come up.
   // Checking status_ without acquiring the lock first would result in a data
   // race.
@@ -78,12 +78,12 @@
 
 //------------------------------------------------------------------------------
 
-static void init(VP9Worker *const worker) {
+static void init(VPxWorker *const worker) {
   memset(worker, 0, sizeof(*worker));
   worker->status_ = NOT_OK;
 }
 
-static int sync(VP9Worker *const worker) {
+static int sync(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   change_state(worker, OK);
 #endif
@@ -91,12 +91,12 @@
   return !worker->had_error;
 }
 
-static int reset(VP9Worker *const worker) {
+static int reset(VPxWorker *const worker) {
   int ok = 1;
   worker->had_error = 0;
   if (worker->status_ < OK) {
 #if CONFIG_MULTITHREAD
-    worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    worker->impl_ = (VPxWorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
     if (worker->impl_ == NULL) {
       return 0;
     }
@@ -129,13 +129,13 @@
   return ok;
 }
 
-static void execute(VP9Worker *const worker) {
+static void execute(VPxWorker *const worker) {
   if (worker->hook != NULL) {
     worker->had_error |= !worker->hook(worker->data1, worker->data2);
   }
 }
 
-static void launch(VP9Worker *const worker) {
+static void launch(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   change_state(worker, WORK);
 #else
@@ -143,7 +143,7 @@
 #endif
 }
 
-static void end(VP9Worker *const worker) {
+static void end(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   if (worker->impl_ != NULL) {
     change_state(worker, NOT_OK);
@@ -162,11 +162,11 @@
 
 //------------------------------------------------------------------------------
 
-static VP9WorkerInterface g_worker_interface = {
+static VPxWorkerInterface g_worker_interface = {
   init, reset, sync, launch, execute, end
 };
 
-int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+int vpx_set_worker_interface(const VPxWorkerInterface* const winterface) {
   if (winterface == NULL ||
       winterface->init == NULL || winterface->reset == NULL ||
       winterface->sync == NULL || winterface->launch == NULL ||
@@ -177,7 +177,7 @@
   return 1;
 }
 
-const VP9WorkerInterface *vp9_get_worker_interface(void) {
+const VPxWorkerInterface *vpx_get_worker_interface(void) {
   return &g_worker_interface;
 }
 
diff --git a/libvpx/vp9/common/vp9_thread.h b/libvpx/vpx_util/vpx_thread.h
similarity index 86%
rename from libvpx/vp9/common/vp9_thread.h
rename to libvpx/vpx_util/vpx_thread.h
index 864579c..de63c4d 100644
--- a/libvpx/vp9/common/vp9_thread.h
+++ b/libvpx/vpx_util/vpx_thread.h
@@ -13,8 +13,8 @@
 //  http://git.chromium.org/webm/libwebp.git
 //  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h
 
-#ifndef VP9_DECODER_VP9_THREAD_H_
-#define VP9_DECODER_VP9_THREAD_H_
+#ifndef VPX_THREAD_H_
+#define VPX_THREAD_H_
 
 #include "./vpx_config.h"
 
@@ -22,9 +22,13 @@
 extern "C" {
 #endif
 
+// Set maximum decode threads to be 8 due to the limit of frame buffers
+// and not enough semaphores in the emulation layer on windows.
+#define MAX_DECODE_THREADS 8
+
 #if CONFIG_MULTITHREAD
 
-#if defined(_WIN32)
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 #include <errno.h>  // NOLINT
 #include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT
@@ -103,8 +107,8 @@
 static INLINE int pthread_cond_init(pthread_cond_t *const condition,
                                     void* cond_attr) {
   (void)cond_attr;
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
   condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
   if (condition->waiting_sem_ == NULL ||
       condition->received_sem_ == NULL ||
@@ -156,59 +160,59 @@
   NOT_OK = 0,   // object is unusable
   OK,           // ready to work
   WORK          // busy finishing the current task
-} VP9WorkerStatus;
+} VPxWorkerStatus;
 
 // Function to be called by the worker thread. Takes two opaque pointers as
 // arguments (data1 and data2), and should return false in case of error.
-typedef int (*VP9WorkerHook)(void*, void*);
+typedef int (*VPxWorkerHook)(void*, void*);
 
 // Platform-dependent implementation details for the worker.
-typedef struct VP9WorkerImpl VP9WorkerImpl;
+typedef struct VPxWorkerImpl VPxWorkerImpl;
 
 // Synchronization object used to launch job in the worker thread
 typedef struct {
-  VP9WorkerImpl *impl_;
-  VP9WorkerStatus status_;
-  VP9WorkerHook hook;     // hook to call
+  VPxWorkerImpl *impl_;
+  VPxWorkerStatus status_;
+  VPxWorkerHook hook;     // hook to call
   void *data1;            // first argument passed to 'hook'
   void *data2;            // second argument passed to 'hook'
   int had_error;          // return value of the last call to 'hook'
-} VP9Worker;
+} VPxWorker;
 
 // The interface for all thread-worker related functions. All these functions
 // must be implemented.
 typedef struct {
   // Must be called first, before any other method.
-  void (*init)(VP9Worker *const worker);
+  void (*init)(VPxWorker *const worker);
   // Must be called to initialize the object and spawn the thread. Re-entrant.
   // Will potentially launch the thread. Returns false in case of error.
-  int (*reset)(VP9Worker *const worker);
+  int (*reset)(VPxWorker *const worker);
   // Makes sure the previous work is finished. Returns true if worker->had_error
   // was not set and no error condition was triggered by the working thread.
-  int (*sync)(VP9Worker *const worker);
+  int (*sync)(VPxWorker *const worker);
   // Triggers the thread to call hook() with data1 and data2 arguments. These
   // hook/data1/data2 values can be changed at any time before calling this
   // function, but not be changed afterward until the next call to Sync().
-  void (*launch)(VP9Worker *const worker);
+  void (*launch)(VPxWorker *const worker);
   // This function is similar to launch() except that it calls the
   // hook directly instead of using a thread. Convenient to bypass the thread
-  // mechanism while still using the VP9Worker structs. sync() must
+  // mechanism while still using the VPxWorker structs. sync() must
   // still be called afterward (for error reporting).
-  void (*execute)(VP9Worker *const worker);
+  void (*execute)(VPxWorker *const worker);
   // Kill the thread and terminate the object. To use the object again, one
   // must call reset() again.
-  void (*end)(VP9Worker *const worker);
-} VP9WorkerInterface;
+  void (*end)(VPxWorker *const worker);
+} VPxWorkerInterface;
 
 // Install a new set of threading functions, overriding the defaults. This
 // should be done before any workers are started, i.e., before any encoding or
 // decoding takes place. The contents of the interface struct are copied, it
 // is safe to free the corresponding memory after this call. This function is
 // not thread-safe. Return false in case of invalid pointer or methods.
-int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+int vpx_set_worker_interface(const VPxWorkerInterface *const winterface);
 
 // Retrieve the currently set thread worker interface.
-const VP9WorkerInterface *vp9_get_worker_interface(void);
+const VPxWorkerInterface *vpx_get_worker_interface(void);
 
 //------------------------------------------------------------------------------
 
@@ -216,4 +220,4 @@
 }    // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_THREAD_H_
+#endif  // VPX_THREAD_H_
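
Note on the vp9_thread -> vpx_util/vpx_thread rename above: the worker API is
unchanged apart from the VP9*/vp9_* -> VPx*/vpx_* renames, and the Windows
pthread emulation now creates its semaphores with a maximum count of
MAX_DECODE_THREADS (8) so multiple waiters can be released. The lifecycle per
the interface comments is init -> reset -> set hook/data -> launch (or
execute for a synchronous call) -> sync -> end. A minimal sketch; do_rows()
and its data are hypothetical, only the VPxWorker API comes from
vpx_thread.h:

  #include <stddef.h>
  #include "vpx_util/vpx_thread.h"

  static int do_rows(void *data1, void *data2) {
    (void)data1;
    (void)data2;
    return 1;  /* hooks return zero on error */
  }

  static int run_one_job(void) {
    const VPxWorkerInterface *iface = vpx_get_worker_interface();
    VPxWorker worker;
    int ok;

    iface->init(&worker);
    if (!iface->reset(&worker)) return 0;  /* thread failed to start */
    worker.hook = do_rows;
    worker.data1 = NULL;
    worker.data2 = NULL;
    iface->launch(&worker);                /* or iface->execute(&worker) */
    ok = iface->sync(&worker);             /* waits; reports hook errors */
    iface->end(&worker);
    return ok;
  }
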
diff --git a/libvpx/vpx_util/vpx_util.mk b/libvpx/vpx_util/vpx_util.mk
new file mode 100644
index 0000000..c0ef8d3
--- /dev/null
+++ b/libvpx/vpx_util/vpx_util.mk
@@ -0,0 +1,14 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_thread.c
+UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h
diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c
index faee42a..3c61bd9 100644
--- a/libvpx/vpxdec.c
+++ b/libvpx/vpxdec.c
@@ -15,18 +15,20 @@
 #include <string.h>
 #include <limits.h>
 
+#include "./vpx_config.h"
+
+#if CONFIG_LIBYUV
 #include "third_party/libyuv/include/libyuv/scale.h"
+#endif
 
 #include "./args.h"
 #include "./ivfdec.h"
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
 
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -45,76 +47,84 @@
   struct WebmInputContext *webm_ctx;
 };
 
-static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1,
-                                          "Number of times to decode the file");
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
-                                          "Codec to use");
-static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
-                                          "Output raw YV12 frames");
-static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
-                                          "Output raw I420 frames");
-static const arg_def_t flipuvarg = ARG_DEF(NULL, "flipuv", 0,
-                                           "Flip the chroma planes in the output");
-static const arg_def_t rawvideo = ARG_DEF(NULL, "rawvideo", 0,
-                                          "Output raw YUV frames");
-static const arg_def_t noblitarg = ARG_DEF(NULL, "noblit", 0,
-                                           "Don't process the decoded frames");
-static const arg_def_t progressarg = ARG_DEF(NULL, "progress", 0,
-                                             "Show progress after each frame decodes");
-static const arg_def_t limitarg = ARG_DEF(NULL, "limit", 1,
-                                          "Stop decoding after n frames");
-static const arg_def_t skiparg = ARG_DEF(NULL, "skip", 1,
-                                         "Skip the first n input frames");
-static const arg_def_t postprocarg = ARG_DEF(NULL, "postproc", 0,
-                                             "Postprocess decoded frames");
-static const arg_def_t summaryarg = ARG_DEF(NULL, "summary", 0,
-                                            "Show timing summary");
-static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
-                                            "Output file name pattern (see below)");
-static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1,
-                                            "Max threads to use");
-static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
-                                            "Show version string");
-static const arg_def_t error_concealment = ARG_DEF(NULL, "error-concealment", 0,
-                                                   "Enable decoder error-concealment");
-static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
-                                            "Scale output frames uniformly");
-static const arg_def_t continuearg =
-    ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
-
-static const arg_def_t fb_arg =
-    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
-
-static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
-                                        "Compute the MD5 sum of the decoded frame");
+static const arg_def_t looparg = ARG_DEF(
+    NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg = ARG_DEF(
+    NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo = ARG_DEF(
+    NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg = ARG_DEF(
+    NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg = ARG_DEF(
+    NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg = ARG_DEF(
+    NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t postprocarg = ARG_DEF(
+    NULL, "postproc", 0, "Postprocess decoded frames");
+static const arg_def_t summaryarg = ARG_DEF(
+    NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg = ARG_DEF(
+    "t", "threads", 1, "Max threads to use");
+static const arg_def_t frameparallelarg = ARG_DEF(
+    NULL, "frame-parallel", 0, "Frame parallel decode");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show version string");
+static const arg_def_t error_concealment = ARG_DEF(
+    NULL, "error-concealment", 0, "Enable decoder error-concealment");
+static const arg_def_t scalearg = ARG_DEF(
+    "S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg = ARG_DEF(
+    "k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg = ARG_DEF(
+    NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg = ARG_DEF(
+    NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+#if CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t outbitdeptharg = ARG_DEF(
+    NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+#endif
 
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
   &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
-  &threadsarg, &verbosearg, &scalearg, &fb_arg,
+  &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg,
   &md5arg, &error_concealment, &continuearg,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &outbitdeptharg,
+#endif
   NULL
 };
 
 #if CONFIG_VP8_DECODER
-static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
-                                                "Enable VP8 postproc add noise");
-static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
-                                         "Enable VP8 deblocking");
-static const arg_def_t demacroblock_level = ARG_DEF(NULL, "demacroblock-level", 1,
-                                                    "Enable VP8 demacroblocking, w/ level");
-static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
-                                               "Enable VP8 visible debug info");
-static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
-                                                   "Display only selected reference frame per macro block");
-static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
-                                                  "Display only selected macro block modes");
-static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
-                                                 "Display only selected block modes");
-static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
-                                             "Draw only selected motion vectors");
-static const arg_def_t mfqe = ARG_DEF(NULL, "mfqe", 0,
-                                      "Enable multiframe quality enhancement");
+static const arg_def_t addnoise_level = ARG_DEF(
+    NULL, "noise-level", 1, "Enable VP8 postproc add noise");
+static const arg_def_t deblock = ARG_DEF(
+    NULL, "deblock", 0, "Enable VP8 deblocking");
+static const arg_def_t demacroblock_level = ARG_DEF(
+    NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level");
+static const arg_def_t pp_debug_info = ARG_DEF(
+    NULL, "pp-debug-info", 1, "Enable VP8 visible debug info");
+static const arg_def_t pp_disp_ref_frame = ARG_DEF(
+    NULL, "pp-dbg-ref-frame", 1,
+    "Display only selected reference frame per macro block");
+static const arg_def_t pp_disp_mb_modes = ARG_DEF(
+    NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes");
+static const arg_def_t pp_disp_b_modes = ARG_DEF(
+    NULL, "pp-dbg-b-modes", 1, "Display only selected block modes");
+static const arg_def_t pp_disp_mvs = ARG_DEF(
+    NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors");
+static const arg_def_t mfqe = ARG_DEF(
+    NULL, "mfqe", 0, "Enable multiframe quality enhancement");
 
 static const arg_def_t *vp8_pp_args[] = {
   &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
@@ -123,8 +133,29 @@
 };
 #endif
 
-static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+#if CONFIG_LIBYUV
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
                                   FilterModeEnum mode) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->fmt == VPX_IMG_FMT_I42016) {
+    assert(dst->fmt == VPX_IMG_FMT_I42016);
+    return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y],
+                        src->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_U],
+                        src->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_V],
+                        src->stride[VPX_PLANE_V]/2,
+                        src->d_w, src->d_h,
+                        (uint16_t*)dst->planes[VPX_PLANE_Y],
+                        dst->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_U],
+                        dst->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_V],
+                        dst->stride[VPX_PLANE_V]/2,
+                        dst->d_w, dst->d_h,
+                        mode);
+  }
+#endif
   assert(src->fmt == VPX_IMG_FMT_I420);
   assert(dst->fmt == VPX_IMG_FMT_I420);
   return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
@@ -137,8 +168,9 @@
                    dst->d_w, dst->d_h,
                    mode);
 }
+#endif
 
-void usage_exit() {
+void usage_exit(void) {
   int i;
 
   fprintf(stderr, "Usage: %s <options> filename\n\n"
@@ -247,7 +279,8 @@
     const int plane = planes[i];
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+                ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
 
     for (y = 0; y < h; ++y) {
@@ -260,6 +293,11 @@
 static void write_image_file(const vpx_image_t *img, const int planes[3],
                              FILE *file) {
   int i, y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+#else
+  const int bytes_per_sample = 1;
+#endif
 
   for (i = 0; i < 3; ++i) {
     const int plane = planes[i];
@@ -269,13 +307,13 @@
     const int h = vpx_img_plane_height(img, plane);
 
     for (y = 0; y < h; ++y) {
-      fwrite(buf, 1, w, file);
+      fwrite(buf, bytes_per_sample, w, file);
       buf += stride;
     }
   }
 }
 
-int file_is_raw(struct VpxInputContext *input) {
+static int file_is_raw(struct VpxInputContext *input) {
   uint8_t buf[32];
   int is_raw = 0;
   vpx_codec_stream_info_t si;
@@ -306,7 +344,7 @@
   return is_raw;
 }
 
-void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+static void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
   fprintf(stderr,
           "%d decoded frames/%d showed frames in %"PRId64" us (%.2f fps)\r",
           frame_in, frame_out, dx_time,
@@ -328,8 +366,8 @@
 // Application private data passed into the set function. |min_size| is the
 // minimum size in bytes needed to decode the next frame. |fb| pointer to the
 // frame buffer.
-int get_vp9_frame_buffer(void *cb_priv, size_t min_size,
-                         vpx_codec_frame_buffer_t *fb) {
+static int get_vp9_frame_buffer(void *cb_priv, size_t min_size,
+                                vpx_codec_frame_buffer_t *fb) {
   int i;
   struct ExternalFrameBufferList *const ext_fb_list =
       (struct ExternalFrameBufferList *)cb_priv;
@@ -347,7 +385,7 @@
 
   if (ext_fb_list->ext_fb[i].size < min_size) {
     free(ext_fb_list->ext_fb[i].data);
-    ext_fb_list->ext_fb[i].data = (uint8_t *)malloc(min_size);
+    ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t));
     if (!ext_fb_list->ext_fb[i].data)
       return -1;
 
@@ -366,8 +404,8 @@
 // Callback used by libvpx when there are no references to the frame buffer.
 // |cb_priv| user private data passed into the set function. |fb| pointer
 // to the frame buffer.
-int release_vp9_frame_buffer(void *cb_priv,
-                             vpx_codec_frame_buffer_t *fb) {
+static int release_vp9_frame_buffer(void *cb_priv,
+                                    vpx_codec_frame_buffer_t *fb) {
   struct ExternalFrameBuffer *const ext_fb =
       (struct ExternalFrameBuffer *)fb->priv;
   (void)cb_priv;
@@ -375,9 +413,9 @@
   return 0;
 }
 
-void generate_filename(const char *pattern, char *out, size_t q_len,
-                       unsigned int d_w, unsigned int d_h,
-                       unsigned int frame_in) {
+static void generate_filename(const char *pattern, char *out, size_t q_len,
+                              unsigned int d_w, unsigned int d_h,
+                              unsigned int frame_in) {
   const char *p = pattern;
   char *q = out;
 
@@ -484,12 +522,22 @@
   } else {
     FILE *file = fopen(name, "wb");
     if (!file)
-      fatal("Failed to output file %s", name);
+      fatal("Failed to open output file '%s'", name);
     return file;
   }
 }
 
-int main_loop(int argc, const char **argv_) {
+#if CONFIG_VP9_HIGHBITDEPTH
+static int img_shifted_realloc_required(const vpx_image_t *img,
+                                        const vpx_image_t *shifted,
+                                        vpx_img_fmt_t required_fmt) {
+  return img->d_w != shifted->d_w ||
+         img->d_h != shifted->d_h ||
+         required_fmt != shifted->fmt;
+}
+#endif
+
+static int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
   int                    i;
@@ -497,7 +545,7 @@
   size_t                 bytes_in_buffer = 0, buffer_size = 0;
   FILE                  *infile;
   int                    frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
-  int                    do_md5 = 0, progress = 0;
+  int                    do_md5 = 0, progress = 0, frame_parallel = 0;
   int                    stop_after = 0, postproc = 0, summary = 0, quiet = 1;
   int                    arg_skip = 0;
   int                    ec_enabled = 0;
@@ -513,6 +561,9 @@
   int                     opt_yv12 = 0;
   int                     opt_i420 = 0;
   vpx_codec_dec_cfg_t     cfg = {0, 0, 0};
+#if CONFIG_VP9_HIGHBITDEPTH
+  int                     output_bit_depth = 0;
+#endif
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
   int                     vp8_dbg_color_ref_frame = 0;
@@ -524,7 +575,10 @@
   int                     dec_flags = 0;
   int                     do_scale = 0;
   vpx_image_t             *scaled_img = NULL;
-  int                     frame_avail, got_data;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t             *img_shifted = NULL;
+#endif
+  int                     frame_avail, got_data, flush_decoder = 0;
   int                     num_external_frame_buffers = 0;
   struct ExternalFrameBufferList ext_fb_list = {0, NULL};
 
@@ -538,7 +592,8 @@
   struct VpxDecInputContext input = {NULL, NULL};
   struct VpxInputContext vpx_input_ctx;
 #if CONFIG_WEBM_IO
-  struct WebmInputContext webm_ctx = {0};
+  struct WebmInputContext webm_ctx;
+  memset(&(webm_ctx), 0, sizeof(webm_ctx));
   input.webm_ctx = &webm_ctx;
 #endif
   input.vpx_input_ctx = &vpx_input_ctx;
@@ -563,6 +618,9 @@
       use_y4m = 0;
       flipuv = 1;
       opt_yv12 = 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+      output_bit_depth = 8;  // For yv12, 8-bit depth output is assumed
+#endif
     } else if (arg_match(&arg, &use_i420, argi)) {
       use_y4m = 0;
       flipuv = 0;
@@ -587,13 +645,23 @@
       summary = 1;
     else if (arg_match(&arg, &threadsarg, argi))
       cfg.threads = arg_parse_uint(&arg);
+#if CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+    else if (arg_match(&arg, &frameparallelarg, argi))
+      frame_parallel = 1;
+#endif
     else if (arg_match(&arg, &verbosearg, argi))
       quiet = 0;
     else if (arg_match(&arg, &scalearg, argi))
       do_scale = 1;
     else if (arg_match(&arg, &fb_arg, argi))
       num_external_frame_buffers = arg_parse_uint(&arg);
-
+    else if (arg_match(&arg, &continuearg, argi))
+      keep_going = 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    else if (arg_match(&arg, &outbitdeptharg, argi)) {
+      output_bit_depth = arg_parse_uint(&arg);
+    }
+#endif
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
@@ -643,11 +711,8 @@
       }
     } else if (arg_match(&arg, &error_concealment, argi)) {
       ec_enabled = 1;
-    } else if (arg_match(&arg, &continuearg, argi)) {
-      keep_going = 1;
     }
-
-#endif
+#endif  // CONFIG_VP8_DECODER
     else
       argj++;
   }
@@ -660,15 +725,15 @@
   /* Handle non-option arguments */
   fn = argv[0];
 
-  if (!fn)
+  if (!fn) {
+    free(argv);
     usage_exit();
-
+  }
   /* Open file */
   infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
 
   if (!infile) {
-    fprintf(stderr, "Failed to open file '%s'", strcmp(fn, "-") ? fn : "stdin");
-    return EXIT_FAILURE;
+    fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
   }
 #if CONFIG_OS_SUPPORT
   /* Make sure we don't dump to the terminal, unless forced to with -o - */
@@ -711,7 +776,7 @@
   if (use_y4m && !noblit) {
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
-              " try --i420 or --yv12.\n");
+              " try --i420 or --yv12 or --rawvideo.\n");
       return EXIT_FAILURE;
     }
 
@@ -736,7 +801,8 @@
     interface = get_vpx_decoder_by_index(0);
 
   dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
-              (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0);
+              (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
+              (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
   if (vpx_codec_dec_init(&decoder, interface->codec_interface(),
                          &cfg, dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -748,34 +814,42 @@
     fprintf(stderr, "%s\n", decoder.name);
 
 #if CONFIG_VP8_DECODER
-
   if (vp8_pp_cfg.post_proc_flag
       && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) {
-    fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder));
+    fprintf(stderr, "Failed to configure postproc: %s\n",
+            vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
 
   if (vp8_dbg_color_ref_frame
-      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame)) {
-    fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME,
+                           vp8_dbg_color_ref_frame)) {
+    fprintf(stderr, "Failed to configure reference block visualizer: %s\n",
+            vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
 
   if (vp8_dbg_color_mb_modes
-      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes)) {
-    fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES,
+                           vp8_dbg_color_mb_modes)) {
+    fprintf(stderr, "Failed to configure macro block visualizer: %s\n",
+            vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
 
   if (vp8_dbg_color_b_modes
-      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes)) {
-    fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES,
+                           vp8_dbg_color_b_modes)) {
+    fprintf(stderr, "Failed to configure block visualizer: %s\n",
+            vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
 
   if (vp8_dbg_display_mv
-      && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv)) {
-    fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
+      && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV,
+                           vp8_dbg_display_mv)) {
+    fprintf(stderr, "Failed to configure motion vector visualizer: %s\n",
+            vpx_codec_error(&decoder));
     return EXIT_FAILURE;
   }
 #endif
@@ -810,7 +884,7 @@
     vpx_codec_iter_t  iter = NULL;
     vpx_image_t    *img;
     struct vpx_usec_timer timer;
-    int                   corrupted;
+    int                   corrupted = 0;
 
     frame_avail = 0;
     if (!stop_after || frame_in < stop_after) {
@@ -834,11 +908,22 @@
 
         vpx_usec_timer_mark(&timer);
         dx_time += vpx_usec_timer_elapsed(&timer);
+      } else {
+        flush_decoder = 1;
       }
+    } else {
+      flush_decoder = 1;
     }
 
     vpx_usec_timer_start(&timer);
 
+    if (flush_decoder) {
+      // Flush the decoder in frame-parallel decode.
+      if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
+        warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
+      }
+    }
+
     got_data = 0;
     if ((img = vpx_codec_get_frame(&decoder, &iter))) {
       ++frame_out;
@@ -848,9 +933,11 @@
     vpx_usec_timer_mark(&timer);
     dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
-    if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
+    if (!frame_parallel &&
+        vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
       warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
-      goto fail;
+      if (!keep_going)
+        goto fail;
     }
     frames_corrupted += corrupted;
 
@@ -883,14 +970,14 @@
               display_height = display_size[1];
             }
           }
-          scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, display_width,
+          scaled_img = vpx_img_alloc(NULL, img->fmt, display_width,
                                      display_height, 16);
           scaled_img->bit_depth = img->bit_depth;
         }
 
         if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
 #if CONFIG_LIBYUV
-          vpx_image_scale(img, scaled_img, kFilterBox);
+          libyuv_scale(img, scaled_img, kFilterBox);
           img = scaled_img;
 #else
           fprintf(stderr, "Failed  to scale output frame: %s.\n"
@@ -901,11 +988,45 @@
 #endif
         }
       }
+#if CONFIG_VP9_HIGHBITDEPTH
+      // Default to codec bit depth if output bit depth not set
+      if (!output_bit_depth) {
+        output_bit_depth = img->bit_depth;
+      }
+      // Shift up or down if necessary
+      if (output_bit_depth != img->bit_depth) {
+        const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
+            img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
+            img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
+        if (img_shifted &&
+            img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+          vpx_img_free(img_shifted);
+          img_shifted = NULL;
+        }
+        if (!img_shifted) {
+          img_shifted = vpx_img_alloc(NULL, shifted_fmt,
+                                      img->d_w, img->d_h, 16);
+          img_shifted->bit_depth = output_bit_depth;
+        }
+        if (output_bit_depth > img->bit_depth) {
+          vpx_img_upshift(img_shifted, img,
+                          output_bit_depth - img->bit_depth);
+        } else {
+          vpx_img_downshift(img_shifted, img,
+                            img->bit_depth - output_bit_depth);
+        }
+        img = img_shifted;
+      }
+#endif
 
       if (single_file) {
         if (use_y4m) {
           char buf[Y4M_BUFFER_SIZE] = {0};
           size_t len = 0;
+          if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) {
+            fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
+            goto fail;
+          }
           if (frame_out == 1) {
             // Y4M file header
             len = y4m_write_file_header(buf, sizeof(buf),
@@ -968,9 +1089,6 @@
         }
       }
     }
-
-    if (stop_after && frame_in >= stop_after)
-      break;
   }
 
   if (summary || progress) {
@@ -1007,6 +1125,9 @@
     free(buf);
 
   if (scaled_img) vpx_img_free(scaled_img);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img_shifted) vpx_img_free(img_shifted);
+#endif
 
   for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
     free(ext_fb_list.ext_fb[i].data);
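
Aside from the static qualifiers and the malloc-to-calloc change, the comments on
get_vp9_frame_buffer()/release_vp9_frame_buffer() above spell out the external
frame-buffer contract: the get callback must hand libvpx at least min_size bytes
and may stash a handle in fb->priv, and the release callback is invoked once
libvpx holds no more references to the buffer. A stripped-down sketch of a pair
of callbacks honouring that contract, under the assumption that the callback
typedefs in vpx/vpx_frame_buffer.h match the signatures vpxdec implements above;
minimal_get_fb and minimal_release_fb are illustrative names only:

#include <stdlib.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h"

static int minimal_get_fb(void *cb_priv, size_t min_size,
                          vpx_codec_frame_buffer_t *fb) {
  uint8_t *const data = (uint8_t *)calloc(min_size, 1);
  (void)cb_priv;
  if (!data) return -1;  // a negative return reports allocation failure
  fb->data = data;
  fb->size = min_size;   // must be at least min_size
  fb->priv = data;       // whatever the release callback needs to free it
  return 0;
}

static int minimal_release_fb(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
  (void)cb_priv;
  free(fb->priv);
  fb->data = NULL;
  fb->size = 0;
  return 0;
}

// Registered once on the initialized decoder context, before the decode loop:
//   vpx_codec_set_frame_buffer_functions(&decoder, minimal_get_fb,
//                                        minimal_release_fb, NULL);

The other behavioural addition in these hunks is the end-of-stream flush for
frame-parallel decoding: once input runs out, vpxdec now calls
vpx_codec_decode(&decoder, NULL, 0, NULL, 0) and keeps draining
vpx_codec_get_frame() so frames still buffered inside the decoder are emitted.
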
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index 7e037a6..06604ea 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -19,20 +19,23 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
 #include "vpx/vpx_encoder.h"
 #if CONFIG_DECODERS
 #include "vpx/vpx_decoder.h"
 #endif
 
-#include "third_party/libyuv/include/libyuv/scale.h"
 #include "./args.h"
 #include "./ivfenc.h"
 #include "./tools_common.h"
 
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -96,7 +99,7 @@
   va_end(ap);
 }
 
-int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
   FILE *f = input_ctx->file;
   y4m_input *y4m = &input_ctx->y4m;
   int shortread = 0;
@@ -111,62 +114,64 @@
   return !shortread;
 }
 
-int file_is_y4m(const char detect[4]) {
+static int file_is_y4m(const char detect[4]) {
   if (memcmp(detect, "YUV4", 4) == 0) {
     return 1;
   }
   return 0;
 }
 
-int fourcc_is_ivf(const char detect[4]) {
+static int fourcc_is_ivf(const char detect[4]) {
   if (memcmp(detect, "DKIF", 4) == 0) {
     return 1;
   }
   return 0;
 }
 
-static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
-                                           "Debug mode (makes output deterministic)");
-static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
-                                            "Output filename");
-static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
-                                          "Input file is YV12 ");
-static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0,
-                                          "Input file is I420 (default)");
-static const arg_def_t use_i422 = ARG_DEF(NULL, "i422", 0,
-                                          "Input file is I422");
-static const arg_def_t use_i444 = ARG_DEF(NULL, "i444", 0,
-                                          "Input file is I444");
-static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
-                                          "Codec to use");
-static const arg_def_t passes           = ARG_DEF("p", "passes", 1,
-                                                  "Number of passes (1/2)");
-static const arg_def_t pass_arg         = ARG_DEF(NULL, "pass", 1,
-                                                  "Pass to execute (1/2)");
-static const arg_def_t fpf_name         = ARG_DEF(NULL, "fpf", 1,
-                                                  "First pass statistics file name");
+static const arg_def_t debugmode = ARG_DEF(
+    "D", "debug", 0, "Debug mode (makes output deterministic)");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output filename");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Input file is YV12 ");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Input file is I420 (default)");
+static const arg_def_t use_i422 = ARG_DEF(
+    NULL, "i422", 0, "Input file is I422");
+static const arg_def_t use_i444 = ARG_DEF(
+    NULL, "i444", 0, "Input file is I444");
+static const arg_def_t use_i440 = ARG_DEF(
+    NULL, "i440", 0, "Input file is I440");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t passes = ARG_DEF(
+    "p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg = ARG_DEF(
+    NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name = ARG_DEF(
+    NULL, "fpf", 1, "First pass statistics file name");
 #if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name         = ARG_DEF(NULL, "fpmbf", 1,
-                                      "First pass block statistics file name");
+static const arg_def_t fpmbf_name = ARG_DEF(
+    NULL, "fpmbf", 1, "First pass block statistics file name");
 #endif
-static const arg_def_t limit = ARG_DEF(NULL, "limit", 1,
-                                       "Stop encoding after n input frames");
-static const arg_def_t skip = ARG_DEF(NULL, "skip", 1,
-                                      "Skip the first n input frames");
-static const arg_def_t deadline         = ARG_DEF("d", "deadline", 1,
-                                                  "Deadline per frame (usec)");
-static const arg_def_t best_dl          = ARG_DEF(NULL, "best", 0,
-                                                  "Use Best Quality Deadline");
-static const arg_def_t good_dl          = ARG_DEF(NULL, "good", 0,
-                                                  "Use Good Quality Deadline");
-static const arg_def_t rt_dl            = ARG_DEF(NULL, "rt", 0,
-                                                  "Use Realtime Quality Deadline");
-static const arg_def_t quietarg         = ARG_DEF("q", "quiet", 0,
-                                                  "Do not print encode progress");
-static const arg_def_t verbosearg       = ARG_DEF("v", "verbose", 0,
-                                                  "Show encoder parameters");
-static const arg_def_t psnrarg          = ARG_DEF(NULL, "psnr", 0,
-                                                  "Show PSNR in status line");
+static const arg_def_t limit = ARG_DEF(
+    NULL, "limit", 1, "Stop encoding after n input frames");
+static const arg_def_t skip = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t deadline = ARG_DEF(
+    "d", "deadline", 1, "Deadline per frame (usec)");
+static const arg_def_t best_dl = ARG_DEF(
+    NULL, "best", 0, "Use Best Quality Deadline");
+static const arg_def_t good_dl = ARG_DEF(
+    NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl = ARG_DEF(
+    NULL, "rt", 0, "Use Realtime Quality Deadline");
+static const arg_def_t quietarg = ARG_DEF(
+    "q", "quiet", 0, "Do not print encode progress");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show encoder parameters");
+static const arg_def_t psnrarg = ARG_DEF(
+    NULL, "psnr", 0, "Show PSNR in status line");
 
 static const struct arg_enum_list test_decode_enum[] = {
   {"off",   TEST_DECODE_OFF},
@@ -174,49 +179,50 @@
   {"warn",  TEST_DECODE_WARN},
   {NULL, 0}
 };
-static const arg_def_t recontest = ARG_DEF_ENUM(NULL, "test-decode", 1,
-                                                "Test encode/decode mismatch",
-                                                test_decode_enum);
-static const arg_def_t framerate        = ARG_DEF(NULL, "fps", 1,
-                                                  "Stream frame rate (rate/scale)");
-static const arg_def_t use_ivf          = ARG_DEF(NULL, "ivf", 0,
-                                                  "Output IVF (default is WebM if WebM IO is enabled)");
-static const arg_def_t out_part = ARG_DEF("P", "output-partitions", 0,
-                                          "Makes encoder output partitions. Requires IVF output!");
-static const arg_def_t q_hist_n         = ARG_DEF(NULL, "q-hist", 1,
-                                                  "Show quantizer histogram (n-buckets)");
-static const arg_def_t rate_hist_n         = ARG_DEF(NULL, "rate-hist", 1,
-                                                     "Show rate histogram (n-buckets)");
-static const arg_def_t disable_warnings =
-    ARG_DEF(NULL, "disable-warnings", 0,
-            "Disable warnings about potentially incorrect encode settings.");
-static const arg_def_t disable_warning_prompt =
-    ARG_DEF("y", "disable-warning-prompt", 0,
-            "Display warnings, but do not prompt user to continue.");
-static const arg_def_t experimental_bitstream =
-    ARG_DEF(NULL, "experimental-bitstream", 0,
-            "Allow experimental bitstream features.");
+static const arg_def_t recontest = ARG_DEF_ENUM(
+    NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
+static const arg_def_t framerate = ARG_DEF(
+    NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_webm = ARG_DEF(
+    NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
+static const arg_def_t use_ivf = ARG_DEF(
+    NULL, "ivf", 0, "Output IVF");
+static const arg_def_t out_part = ARG_DEF(
+    "P", "output-partitions", 0,
+    "Makes encoder output partitions. Requires IVF output!");
+static const arg_def_t q_hist_n = ARG_DEF(
+    NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
+static const arg_def_t rate_hist_n = ARG_DEF(
+    NULL, "rate-hist", 1, "Show rate histogram (n-buckets)");
+static const arg_def_t disable_warnings = ARG_DEF(
+    NULL, "disable-warnings", 0,
+    "Disable warnings about potentially incorrect encode settings.");
+static const arg_def_t disable_warning_prompt = ARG_DEF(
+    "y", "disable-warning-prompt", 0,
+    "Display warnings, but do not prompt user to continue.");
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t test16bitinternalarg = ARG_DEF(
+    NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
+#endif
 
 static const arg_def_t *main_args[] = {
   &debugmode,
   &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip,
   &deadline, &best_dl, &good_dl, &rt_dl,
-  &quietarg, &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n,
-  &rate_hist_n, &disable_warnings, &disable_warning_prompt,
+  &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n,
+  &rate_hist_n, &disable_warnings, &disable_warning_prompt, &recontest,
   NULL
 };
 
-static const arg_def_t usage            = ARG_DEF("u", "usage", 1,
-                                                  "Usage profile number to use");
-static const arg_def_t threads          = ARG_DEF("t", "threads", 1,
-                                                  "Max number of threads to use");
-static const arg_def_t profile          = ARG_DEF(NULL, "profile", 1,
-                                                  "Bitstream profile number to use");
-static const arg_def_t width            = ARG_DEF("w", "width", 1,
-                                                  "Frame width");
-static const arg_def_t height           = ARG_DEF("h", "height", 1,
-                                                  "Frame height");
+static const arg_def_t usage = ARG_DEF(
+    "u", "usage", 1, "Usage profile number to use");
+static const arg_def_t threads = ARG_DEF(
+    "t", "threads", 1, "Max number of threads to use");
+static const arg_def_t profile = ARG_DEF(
+    NULL, "profile", 1, "Bitstream profile number to use");
+static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
+static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
 #if CONFIG_WEBM_IO
 static const struct arg_enum_list stereo_mode_enum[] = {
   {"mono", STEREO_FORMAT_MONO},
@@ -226,18 +232,18 @@
   {"right-left", STEREO_FORMAT_RIGHT_LEFT},
   {NULL, 0}
 };
-static const arg_def_t stereo_mode      = ARG_DEF_ENUM(NULL, "stereo-mode", 1,
-                                                       "Stereo 3D video format", stereo_mode_enum);
+static const arg_def_t stereo_mode = ARG_DEF_ENUM(
+    NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum);
 #endif
-static const arg_def_t timebase         = ARG_DEF(NULL, "timebase", 1,
-                                                  "Output timestamp precision (fractional seconds)");
-static const arg_def_t error_resilient  = ARG_DEF(NULL, "error-resilient", 1,
-                                                  "Enable error resiliency features");
-static const arg_def_t lag_in_frames    = ARG_DEF(NULL, "lag-in-frames", 1,
-                                                  "Max number of frames to lag");
+static const arg_def_t timebase = ARG_DEF(
+    NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
+static const arg_def_t error_resilient = ARG_DEF(
+    NULL, "error-resilient", 1, "Enable error resiliency features");
+static const arg_def_t lag_in_frames = ARG_DEF(
+    NULL, "lag-in-frames", 1, "Max number of frames to lag");
 
 static const arg_def_t *global_args[] = {
-  &use_yv12, &use_i420, &use_i422, &use_i444,
+  &use_yv12, &use_i420, &use_i422, &use_i444, &use_i440,
   &usage, &threads, &profile,
   &width, &height,
 #if CONFIG_WEBM_IO
@@ -245,21 +251,24 @@
 #endif
   &timebase, &framerate,
   &error_resilient,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &test16bitinternalarg,
+#endif
   &lag_in_frames, NULL
 };
 
-static const arg_def_t dropframe_thresh   = ARG_DEF(NULL, "drop-frame", 1,
-                                                    "Temporal resampling threshold (buf %)");
-static const arg_def_t resize_allowed     = ARG_DEF(NULL, "resize-allowed", 1,
-                                                    "Spatial resampling enabled (bool)");
-static const arg_def_t resize_width       = ARG_DEF(NULL, "resize-width", 1,
-                                                    "Width of encoded frame");
-static const arg_def_t resize_height      = ARG_DEF(NULL, "resize-height", 1,
-                                                    "Height of encoded frame");
-static const arg_def_t resize_up_thresh   = ARG_DEF(NULL, "resize-up", 1,
-                                                    "Upscale threshold (buf %)");
-static const arg_def_t resize_down_thresh = ARG_DEF(NULL, "resize-down", 1,
-                                                    "Downscale threshold (buf %)");
+static const arg_def_t dropframe_thresh = ARG_DEF(
+    NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t resize_allowed = ARG_DEF(
+    NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)");
+static const arg_def_t resize_width = ARG_DEF(
+    NULL, "resize-width", 1, "Width of encoded frame");
+static const arg_def_t resize_height = ARG_DEF(
+    NULL, "resize-height", 1, "Height of encoded frame");
+static const arg_def_t resize_up_thresh = ARG_DEF(
+    NULL, "resize-up", 1, "Upscale threshold (buf %)");
+static const arg_def_t resize_down_thresh = ARG_DEF(
+    NULL, "resize-down", 1, "Downscale threshold (buf %)");
 static const struct arg_enum_list end_usage_enum[] = {
   {"vbr", VPX_VBR},
   {"cbr", VPX_CBR},
@@ -267,24 +276,24 @@
   {"q",   VPX_Q},
   {NULL, 0}
 };
-static const arg_def_t end_usage          = ARG_DEF_ENUM(NULL, "end-usage", 1,
-                                                         "Rate control mode", end_usage_enum);
-static const arg_def_t target_bitrate     = ARG_DEF(NULL, "target-bitrate", 1,
-                                                    "Bitrate (kbps)");
-static const arg_def_t min_quantizer      = ARG_DEF(NULL, "min-q", 1,
-                                                    "Minimum (best) quantizer");
-static const arg_def_t max_quantizer      = ARG_DEF(NULL, "max-q", 1,
-                                                    "Maximum (worst) quantizer");
-static const arg_def_t undershoot_pct     = ARG_DEF(NULL, "undershoot-pct", 1,
-                                                    "Datarate undershoot (min) target (%)");
-static const arg_def_t overshoot_pct      = ARG_DEF(NULL, "overshoot-pct", 1,
-                                                    "Datarate overshoot (max) target (%)");
-static const arg_def_t buf_sz             = ARG_DEF(NULL, "buf-sz", 1,
-                                                    "Client buffer size (ms)");
-static const arg_def_t buf_initial_sz     = ARG_DEF(NULL, "buf-initial-sz", 1,
-                                                    "Client initial buffer size (ms)");
-static const arg_def_t buf_optimal_sz     = ARG_DEF(NULL, "buf-optimal-sz", 1,
-                                                    "Client optimal buffer size (ms)");
+static const arg_def_t end_usage = ARG_DEF_ENUM(
+    NULL, "end-usage", 1, "Rate control mode", end_usage_enum);
+static const arg_def_t target_bitrate = ARG_DEF(
+    NULL, "target-bitrate", 1, "Bitrate (kbps)");
+static const arg_def_t min_quantizer = ARG_DEF(
+    NULL, "min-q", 1, "Minimum (best) quantizer");
+static const arg_def_t max_quantizer = ARG_DEF(
+    NULL, "max-q", 1, "Maximum (worst) quantizer");
+static const arg_def_t undershoot_pct = ARG_DEF(
+    NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)");
+static const arg_def_t overshoot_pct = ARG_DEF(
+    NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)");
+static const arg_def_t buf_sz = ARG_DEF(
+    NULL, "buf-sz", 1, "Client buffer size (ms)");
+static const arg_def_t buf_initial_sz = ARG_DEF(
+    NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
+static const arg_def_t buf_optimal_sz = ARG_DEF(
+    NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
 static const arg_def_t *rc_args[] = {
   &dropframe_thresh, &resize_allowed, &resize_width, &resize_height,
   &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate,
@@ -293,63 +302,65 @@
 };
 
 
-static const arg_def_t bias_pct = ARG_DEF(NULL, "bias-pct", 1,
-                                          "CBR/VBR bias (0=CBR, 100=VBR)");
-static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1,
-                                                "GOP min bitrate (% of target)");
-static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1,
-                                                "GOP max bitrate (% of target)");
+static const arg_def_t bias_pct = ARG_DEF(
+    NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
+static const arg_def_t minsection_pct = ARG_DEF(
+    NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
+static const arg_def_t maxsection_pct = ARG_DEF(
+    NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
 static const arg_def_t *rc_twopass_args[] = {
   &bias_pct, &minsection_pct, &maxsection_pct, NULL
 };
 
 
-static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1,
-                                             "Minimum keyframe interval (frames)");
-static const arg_def_t kf_max_dist = ARG_DEF(NULL, "kf-max-dist", 1,
-                                             "Maximum keyframe interval (frames)");
-static const arg_def_t kf_disabled = ARG_DEF(NULL, "disable-kf", 0,
-                                             "Disable keyframe placement");
+static const arg_def_t kf_min_dist = ARG_DEF(
+    NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
+static const arg_def_t kf_max_dist = ARG_DEF(
+    NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
+static const arg_def_t kf_disabled = ARG_DEF(
+    NULL, "disable-kf", 0, "Disable keyframe placement");
 static const arg_def_t *kf_args[] = {
   &kf_min_dist, &kf_max_dist, &kf_disabled, NULL
 };
 
 
-static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
-                                            "Noise sensitivity (frames to blur)");
-static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
-                                           "Filter sharpness (0-7)");
-static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1,
-                                               "Motion detection threshold");
-static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
-                                          "CPU Used (-16..16)");
-static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
-                                             "Enable automatic alt reference frames");
-static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
-                                                "AltRef Max Frames");
-static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
-                                               "AltRef Strength");
-static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
-                                           "AltRef Type");
+static const arg_def_t noise_sens = ARG_DEF(
+    NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
+static const arg_def_t sharpness = ARG_DEF(
+    NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+static const arg_def_t static_thresh = ARG_DEF(
+    NULL, "static-thresh", 1, "Motion detection threshold");
+static const arg_def_t auto_altref = ARG_DEF(
+    NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
+static const arg_def_t arnr_maxframes = ARG_DEF(
+    NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
+static const arg_def_t arnr_strength = ARG_DEF(
+    NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
+static const arg_def_t arnr_type = ARG_DEF(
+    NULL, "arnr-type", 1, "AltRef type");
 static const struct arg_enum_list tuning_enum[] = {
   {"psnr", VP8_TUNE_PSNR},
   {"ssim", VP8_TUNE_SSIM},
   {NULL, 0}
 };
-static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
-                                                "Material to favor", tuning_enum);
-static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1,
-                                          "Constant/Constrained Quality level");
-static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
-                                                    "Max I-frame bitrate (pct)");
+static const arg_def_t tune_ssim = ARG_DEF_ENUM(
+    NULL, "tune", 1, "Material to favor", tuning_enum);
+static const arg_def_t cq_level = ARG_DEF(
+    NULL, "cq-level", 1, "Constant/Constrained Quality level");
+static const arg_def_t max_intra_rate_pct = ARG_DEF(
+    NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
 
 #if CONFIG_VP8_ENCODER
-static const arg_def_t token_parts =
-    ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
+static const arg_def_t cpu_used_vp8 = ARG_DEF(
+    NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t token_parts = ARG_DEF(
+    NULL, "token-parts", 1, "Number of token partitions to use, log2");
+static const arg_def_t screen_content_mode = ARG_DEF(
+    NULL, "screen-content-mode", 1, "Screen content mode");
 static const arg_def_t *vp8_args[] = {
-  &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
+  &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
-  &tune_ssim, &cq_level, &max_intra_rate_pct,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &screen_content_mode,
   NULL
 };
 static const int vp8_arg_ctrl_map[] = {
@@ -358,16 +369,20 @@
   VP8E_SET_TOKEN_PARTITIONS,
   VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
   VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP8E_SET_SCREEN_CONTENT_MODE,
   0
 };
 #endif
 
-#if CONFIG_VP9_ENCODER
-static const arg_def_t tile_cols =
-    ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
-static const arg_def_t tile_rows =
-    ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
-static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+static const arg_def_t cpu_used_vp9 = ARG_DEF(
+    NULL, "cpu-used", 1, "CPU Used (-8..8)");
+static const arg_def_t tile_cols = ARG_DEF(
+    NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows = ARG_DEF(
+    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(
+    NULL, "lossless", 1, "Lossless mode");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
     NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
 static const arg_def_t aq_mode = ARG_DEF(
@@ -375,8 +390,50 @@
     "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
     "3: cyclic refresh)");
 static const arg_def_t frame_periodic_boost = ARG_DEF(
-    NULL, "frame_boost", 1,
+    NULL, "frame-boost", 1,
     "Enable frame periodic boost (0: off (default), 1: on)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+    NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
+static const arg_def_t max_inter_rate_pct = ARG_DEF(
+    NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t min_gf_interval = ARG_DEF(
+    NULL, "min-gf-interval", 1,
+    "min gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t max_gf_interval = ARG_DEF(
+    NULL, "max-gf-interval", 1,
+    "max gf/arf frame interval (default 0, indicating in-built behavior)");
+
+static const struct arg_enum_list color_space_enum[] = {
+  { "unknown", VPX_CS_UNKNOWN },
+  { "bt601", VPX_CS_BT_601 },
+  { "bt709", VPX_CS_BT_709 },
+  { "smpte170", VPX_CS_SMPTE_170 },
+  { "smpte240", VPX_CS_SMPTE_240 },
+  { "bt2020", VPX_CS_BT_2020 },
+  { "reserved", VPX_CS_RESERVED },
+  { "sRGB", VPX_CS_SRGB },
+  { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+    NULL, "color-space", 1,
+    "The color space of input content:", color_space_enum);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
+    "b", "bit-depth", 1,
+    "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)",
+    bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(
+    NULL, "input-bit-depth", 1, "Bit depth of input");
+#endif
 
 static const struct arg_enum_list tune_content_enum[] = {
   {"default", VP9E_CONTENT_DEFAULT},
@@ -386,30 +443,65 @@
 
 static const arg_def_t tune_content = ARG_DEF_ENUM(
     NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+#endif
 
+#if CONFIG_VP9_ENCODER
 static const arg_def_t *vp9_args[] = {
-  &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
+  &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
-  &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
-  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+  &gf_cbr_boost_pct, &lossless,
+  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
+  &noise_sens, &tune_content, &input_color_space,
+  &min_gf_interval, &max_gf_interval,
   NULL
 };
 static const int vp9_arg_ctrl_map[] = {
   VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
-  VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
+  VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
   VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
   VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
   VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
-  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_TUNE_CONTENT,
+  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+  0
+};
+#endif
+
+#if CONFIG_VP10_ENCODER
+static const arg_def_t *vp10_args[] = {
+  &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
+  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+  &gf_cbr_boost_pct, &lossless,
+  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
+  &noise_sens, &tune_content, &input_color_space,
+  &min_gf_interval, &max_gf_interval,
+  NULL
+};
+static const int vp10_arg_ctrl_map[] = {
+  VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
+  VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
+  VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
+  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
+  VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
+  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
+  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
   0
 };
 #endif
 
 static const arg_def_t *no_args[] = { NULL };
 
-void usage_exit() {
+void usage_exit(void) {
   int i;
+  const int num_encoder = get_vpx_encoder_count();
 
   fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
           exec_name);
@@ -432,21 +524,125 @@
   fprintf(stderr, "\nVP9 Specific Options:\n");
   arg_show_usage(stderr, vp9_args);
 #endif
+#if CONFIG_VP10_ENCODER
+  fprintf(stderr, "\nVP10 Specific Options:\n");
+  arg_show_usage(stderr, vp10_args);
+#endif
   fprintf(stderr, "\nStream timebase (--timebase):\n"
           "  The desired precision of timestamps in the output, expressed\n"
           "  in fractional seconds. Default is 1/1000.\n");
   fprintf(stderr, "\nIncluded encoders:\n\n");
 
-  for (i = 0; i < get_vpx_encoder_count(); ++i) {
+  for (i = 0; i < num_encoder; ++i) {
     const VpxInterface *const encoder = get_vpx_encoder_by_index(i);
-    fprintf(stderr, "    %-6s - %s\n",
-            encoder->name, vpx_codec_iface_name(encoder->codec_interface()));
+    const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
+    fprintf(stderr, "    %-6s - %s %s\n",
+            encoder->name, vpx_codec_iface_name(encoder->codec_interface()),
+            defstr);
   }
+  fprintf(stderr, "\n        ");
+  fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
 
   exit(EXIT_FAILURE);
 }
 
 #define mmin(a, b)  ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void find_mismatch_high(const vpx_image_t *const img1,
+                               const vpx_image_t *const img2,
+                               int yloc[4], int uloc[4], int vloc[4]) {
+  uint16_t *plane1, *plane2;
+  uint32_t stride1, stride2;
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y];
+  stride1 = img1->stride[VPX_PLANE_Y]/2;
+  stride2 = img2->stride[VPX_PLANE_Y]/2;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_U];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_U];
+  stride1 = img1->stride[VPX_PLANE_U]/2;
+  stride2 = img2->stride[VPX_PLANE_U]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_V];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_V];
+  stride1 = img1->stride[VPX_PLANE_V]/2;
+  stride2 = img2->stride[VPX_PLANE_V]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 static void find_mismatch(const vpx_image_t *const img1,
                           const vpx_image_t *const img2,
                           int yloc[4], int uloc[4], int vloc[4]) {
@@ -539,7 +735,8 @@
 
 static int compare_img(const vpx_image_t *const img1,
                        const vpx_image_t *const img2) {
-  const uint32_t c_w =
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w =
       (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
   const uint32_t c_h =
       (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
@@ -549,11 +746,17 @@
   match &= (img1->fmt == img2->fmt);
   match &= (img1->d_w == img2->d_w);
   match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
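+    // High bitdepth samples are two bytes each, so double the per-row
+    // lengths used in the memcmp calls below.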
+    l_w *= 2;
+    c_w *= 2;
+  }
+#endif
 
   for (i = 0; i < img1->d_h; ++i)
     match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
                      img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     img1->d_w) == 0);
+                     l_w) == 0);
 
   for (i = 0; i < c_h; ++i)
     match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
@@ -570,14 +773,12 @@
 
 
 #define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
-#define MAX(x,y) ((x)>(y)?(x):(y))
-#if CONFIG_VP8_ENCODER && !CONFIG_VP9_ENCODER
-#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
-#elif !CONFIG_VP8_ENCODER && CONFIG_VP9_ENCODER
+#if CONFIG_VP10_ENCODER
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp10_arg_ctrl_map)
+#elif CONFIG_VP9_ENCODER
 #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map)
 #else
-#define ARG_CTRL_CNT_MAX MAX(NELEMENTS(vp8_arg_ctrl_map), \
-                             NELEMENTS(vp9_arg_ctrl_map))
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
 #endif
 
 #if !CONFIG_WEBM_IO
@@ -598,6 +799,10 @@
   int                       arg_ctrl_cnt;
   int                       write_webm;
   int                       have_kf_max_dist;
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Whether to use 16-bit internal buffers.
+  int                       use_16bit_internal;
+#endif
 };
 
 
@@ -627,8 +832,8 @@
 };
 
 
-void validate_positive_rational(const char          *msg,
-                                struct vpx_rational *rat) {
+static void validate_positive_rational(const char          *msg,
+                                       struct vpx_rational *rat) {
   if (rat->den < 0) {
     rat->num *= -1;
     rat->den *= -1;
@@ -645,10 +850,14 @@
 static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   char       **argi, **argj;
   struct arg   arg;
+  const int num_encoder = get_vpx_encoder_count();
+
+  if (num_encoder < 1)
+    die("Error: no valid encoder available\n");
 
   /* Initialize default parameters */
   memset(global, 0, sizeof(*global));
-  global->codec = get_vpx_encoder_by_index(0);
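+  /* Default to the last registered encoder; usage_exit() marks that entry
+   * as "(default)" in the encoder list. */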
+  global->codec = get_vpx_encoder_by_index(num_encoder - 1);
   global->passes = 0;
   global->color_type = I420;
   /* Assign default deadline to good quality */
@@ -690,6 +899,8 @@
       global->color_type = I422;
     else if (arg_match(&arg, &use_i444, argi))
       global->color_type = I444;
+    else if (arg_match(&arg, &use_i440, argi))
+      global->color_type = I440;
     else if (arg_match(&arg, &quietarg, argi))
       global->quiet = 1;
     else if (arg_match(&arg, &verbosearg, argi))
@@ -718,8 +929,6 @@
       global->disable_warnings = 1;
     else if (arg_match(&arg, &disable_warning_prompt, argi))
       global->disable_warning_prompt = 1;
-    else if (arg_match(&arg, &experimental_bitstream, argi))
-      global->experimental_bitstream = 1;
     else
       argj++;
   }
@@ -734,11 +943,12 @@
   }
   /* Validate global config */
   if (global->passes == 0) {
-#if CONFIG_VP9_ENCODER
+#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
     // Make default VP9 passes = 2 until there is a better quality 1-pass
     // encoder
-    global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
-                      global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+    if (global->codec != NULL && global->codec->name != NULL)
+      global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                        global->deadline != VPX_DL_REALTIME) ? 2 : 1;
 #else
     global->passes = 1;
 #endif
@@ -752,7 +962,7 @@
 }
 
 
-void open_input_file(struct VpxInputContext *input) {
+static void open_input_file(struct VpxInputContext *input) {
   /* Parse certain options from the input file, if possible */
   input->file = strcmp(input->filename, "-")
       ? fopen(input->filename, "rb") : set_binary_mode(stdin);
@@ -768,6 +978,10 @@
     rewind(input->file);
   }
 
+  /* Default to 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
  /* For RAW input sources, these bytes will be applied on the first frame
    *  in read_frame().
    */
@@ -781,6 +995,8 @@
       input->file_type = FILE_TYPE_Y4M;
       input->width = input->y4m.pic_w;
       input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
       input->framerate.numerator = input->y4m.fps_n;
       input->framerate.denominator = input->y4m.fps_d;
       input->fmt = input->y4m.vpx_fmt;
@@ -806,8 +1022,10 @@
   struct stream_state *stream;
 
   stream = calloc(1, sizeof(*stream));
-  if (!stream)
+  if (stream == NULL) {
     fatal("Failed to allocate new stream.");
+  }
+
   if (prev) {
     memcpy(stream, prev, sizeof(*stream));
     stream->index++;
@@ -867,6 +1085,9 @@
   static const int        *ctrl_args_map = NULL;
   struct stream_config    *config = &stream->config;
   int                      eos_mark_found = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int                      test_16bit_internal = 0;
+#endif
 
   // Handle codec specific options
   if (0) {
@@ -880,6 +1101,13 @@
     ctrl_args = vp9_args;
     ctrl_args_map = vp9_arg_ctrl_map;
 #endif
+#if CONFIG_VP10_ENCODER
+  } else if (strcmp(global->codec->name, "vp10") == 0) {
+    // TODO(jingning): Reuse VP9 specific encoder configuration parameters.
+    // Consider expanding this set for VP10 encoder control.
+    ctrl_args = vp10_args;
+    ctrl_args_map = vp10_arg_ctrl_map;
+#endif
   }
 
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
@@ -896,8 +1124,7 @@
       continue;
     }
 
-    if (0) {
-    } else if (arg_match(&arg, &outputfile, argi)) {
+    if (arg_match(&arg, &outputfile, argi)) {
       config->out_fn = arg.val;
     } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
@@ -905,6 +1132,12 @@
     } else if (arg_match(&arg, &fpmbf_name, argi)) {
       config->fpmb_stats_fn = arg.val;
 #endif
+    } else if (arg_match(&arg, &use_webm, argi)) {
+#if CONFIG_WEBM_IO
+      config->write_webm = 1;
+#else
+      die("Error: --webm specified but webm is disabled.");
+#endif
     } else if (arg_match(&arg, &use_ivf, argi)) {
       config->write_webm = 0;
     } else if (arg_match(&arg, &threads, argi)) {
@@ -915,6 +1148,12 @@
       config->cfg.g_w = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdeptharg, argi)) {
+      config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+      config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+#endif
 #if CONFIG_WEBM_IO
     } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -982,6 +1221,13 @@
       config->have_kf_max_dist = 1;
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
+      if (strcmp(global->codec->name, "vp9") == 0 ||
+          strcmp(global->codec->name, "vp10") == 0) {
+        test_16bit_internal = 1;
+      }
+#endif
     } else {
       int i, match = 0;
       for (i = 0; ctrl_args[i]; i++) {
@@ -993,24 +1239,31 @@
           * instance of this control.
           */
           for (j = 0; j < config->arg_ctrl_cnt; j++)
-            if (config->arg_ctrls[j][0] == ctrl_args_map[i])
+            if (ctrl_args_map != NULL &&
+                config->arg_ctrls[j][0] == ctrl_args_map[i])
               break;
 
           /* Update/insert */
           assert(j < (int)ARG_CTRL_CNT_MAX);
-          if (j < (int)ARG_CTRL_CNT_MAX) {
+          if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
             config->arg_ctrls[j][0] = ctrl_args_map[i];
             config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
             if (j == config->arg_ctrl_cnt)
               config->arg_ctrl_cnt++;
           }
-
         }
       }
       if (!match)
         argj++;
     }
   }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (strcmp(global->codec->name, "vp9") == 0 ||
+      strcmp(global->codec->name, "vp10") == 0) {
+    config->use_16bit_internal = test_16bit_internal |
+                                 (config->cfg.g_profile > 1);
+  }
+#endif
   return eos_mark_found;
 }
 
@@ -1027,15 +1280,18 @@
 static void validate_stream_config(const struct stream_state *stream,
                                    const struct VpxEncoderConfig *global) {
   const struct stream_state *streami;
+  (void)global;
 
   if (!stream->config.cfg.g_w || !stream->config.cfg.g_h)
     fatal("Stream %d: Specify stream dimensions with --width (-w) "
           " and --height (-h)", stream->index);
 
-  if (stream->config.cfg.g_profile != 0 && !global->experimental_bitstream) {
-    fatal("Stream %d: profile %d is experimental and requires the --%s flag",
-          stream->index, stream->config.cfg.g_profile,
-          experimental_bitstream.long_name);
+  // Check that the codec bit depth is not less than the input bit depth.
+  if (stream->config.cfg.g_input_bit_depth >
+      (unsigned int)stream->config.cfg.g_bit_depth) {
+    fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+          stream->index, (int)stream->config.cfg.g_bit_depth,
+          stream->config.cfg.g_input_bit_depth);
   }
 
   for (streami = stream; streami; streami = streami->next) {
@@ -1116,7 +1372,12 @@
     case VPX_IMG_FMT_I420: return "I420";
     case VPX_IMG_FMT_I422: return "I422";
     case VPX_IMG_FMT_I444: return "I444";
+    case VPX_IMG_FMT_I440: return "I440";
     case VPX_IMG_FMT_YV12: return "YV12";
+    case VPX_IMG_FMT_I42016: return "I42016";
+    case VPX_IMG_FMT_I42216: return "I42216";
+    case VPX_IMG_FMT_I44416: return "I44416";
+    case VPX_IMG_FMT_I44016: return "I44016";
     default: return "Other";
   }
 }
@@ -1146,6 +1407,8 @@
   SHOW(g_profile);
   SHOW(g_w);
   SHOW(g_h);
+  SHOW(g_bit_depth);
+  SHOW(g_input_bit_depth);
   SHOW(g_timebase.num);
   SHOW(g_timebase.den);
   SHOW(g_error_resilient);
@@ -1176,7 +1439,8 @@
 
 
 static void open_output_file(struct stream_state *stream,
-                             struct VpxEncoderConfig *global) {
+                             struct VpxEncoderConfig *global,
+                             const struct VpxRational *pixel_aspect_ratio) {
   const char *fn = stream->config.out_fn;
   const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
 
@@ -1197,7 +1461,8 @@
     write_webm_file_header(&stream->ebml, cfg,
                            &global->framerate,
                            stream->config.stereo_fmt,
-                           global->codec->fourcc);
+                           global->codec->fourcc,
+                           pixel_aspect_ratio);
   }
 #endif
 
@@ -1278,6 +1543,9 @@
 
   flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
   flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0;
+#endif
 
   /* Construct Encoder Context */
   vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
@@ -1323,6 +1591,46 @@
                      / cfg->g_timebase.num / global->framerate.num;
 
   /* Scale if necessary */
+#if CONFIG_VP9_HIGHBITDEPTH
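+  // High bitdepth frames are scaled with libyuv's I420Scale_16 when the input
+  // and stream dimensions differ; only 4:2:0 input can be scaled.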
+  if (img) {
+    if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) &&
+        (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+      if (img->fmt != VPX_IMG_FMT_I42016) {
+        fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+        exit(EXIT_FAILURE);
+      }
+#if CONFIG_LIBYUV
+      if (!stream->img) {
+        stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016,
+                                    cfg->g_w, cfg->g_h, 16);
+      }
+      I420Scale_16((uint16*)img->planes[VPX_PLANE_Y],
+                   img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)img->planes[VPX_PLANE_U],
+                   img->stride[VPX_PLANE_U]/2,
+                   (uint16*)img->planes[VPX_PLANE_V],
+                   img->stride[VPX_PLANE_V]/2,
+                   img->d_w, img->d_h,
+                   (uint16*)stream->img->planes[VPX_PLANE_Y],
+                   stream->img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_U],
+                   stream->img->stride[VPX_PLANE_U]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_V],
+                   stream->img->stride[VPX_PLANE_V]/2,
+                   stream->img->d_w, stream->img->d_h,
+                   kFilterBox);
+      img = stream->img;
+#else
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "Scaling disabled in this configuration. \n"
+                      "To enable, configure with --enable-libyuv\n",
+                      stream->index);
+#endif
+    }
+  }
+#endif
   if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
     if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
       fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
@@ -1478,7 +1786,7 @@
 }
 
 
-static void show_psnr(struct stream_state  *stream) {
+static void show_psnr(struct stream_state  *stream, double peak) {
   int i;
   double ovpsnr;
 
@@ -1486,7 +1794,7 @@
     return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, 255.0,
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak,
                        (double)stream->psnr_sse_total);
   fprintf(stderr, " %.3f", ovpsnr);
 
@@ -1501,7 +1809,6 @@
   return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
 }
 
-
 static void test_decode(struct stream_state  *stream,
                         enum TestDecodeFatality fatal,
                         const VpxInterface *codec) {
@@ -1527,20 +1834,44 @@
     vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
     vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
   } else {
-    struct vp9_ref_frame ref;
+    struct vp9_ref_frame ref_enc, ref_dec;
 
-    ref.idx = 0;
-    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);
-    enc_img = ref.img;
-    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);
-    dec_img = ref.img;
+    ref_enc.idx = 0;
+    ref_dec.idx = 0;
+    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
+    enc_img = ref_enc.img;
+    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
+    dec_img = ref_dec.img;
+#if CONFIG_VP9_HIGHBITDEPTH
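+    // If only one of the two reference frames is high bitdepth, truncate it
+    // to 8 bits so both images use the same format for comparison.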
+    if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+        (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+      if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      enc_img.d_w, enc_img.d_h, 16);
+        vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+      }
+      if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      dec_img.d_w, dec_img.d_h, 16);
+        vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
+      }
+    }
+#endif
   }
   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
 
   if (!compare_img(&enc_img, &dec_img)) {
     int y[4], u[4], v[4];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+#else
     find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
     stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d at"
@@ -1582,6 +1913,12 @@
 int main(int argc, const char **argv_) {
   int pass;
   vpx_image_t raw;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int use_16bit_internal = 0;
+  int input_shift = 0;
+#endif
   int frame_avail, got_data;
 
   struct VpxInputContext input;
@@ -1621,6 +1958,9 @@
     case I444:
       input.fmt = VPX_IMG_FMT_I444;
       break;
+    case I440:
+      input.fmt = VPX_IMG_FMT_I440;
+      break;
     case YV12:
       input.fmt = VPX_IMG_FMT_YV12;
       break;
@@ -1670,19 +2010,41 @@
     /* If the input file doesn't specify its w/h (raw files), try to get
      * the data from the first stream's configuration.
      */
-    if (!input.width || !input.height)
-      FOREACH_STREAM( {
-      if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
-        input.width = stream->config.cfg.g_w;
-        input.height = stream->config.cfg.g_h;
-        break;
-      }
-    });
+    if (!input.width || !input.height) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+          input.width = stream->config.cfg.g_w;
+          input.height = stream->config.cfg.g_h;
+          break;
+        }
+      });
+    }
 
     /* Update stream configurations from the input file's parameters */
     if (!input.width || !input.height)
       fatal("Specify stream dimensions with --width (-w) "
             " and --height (-h)");
+
+    /* If the input file does not specify its bit-depth but the
+     * input-bit-depth parameter exists, assume that to be the input
+     * bit-depth. However, if the input-bit-depth parameter does not exist,
+     * assume the input bit-depth to be the same as the codec bit-depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      });
+      if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM({
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      });
+    }
+
     FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
     FOREACH_STREAM(validate_stream_config(stream, &global));
 
@@ -1733,9 +2095,30 @@
     }
 
     FOREACH_STREAM(setup_pass(stream, &global, pass));
-    FOREACH_STREAM(open_output_file(stream, &global));
+    FOREACH_STREAM(open_output_file(stream, &global,
+                                    &input.pixel_aspect_ratio));
     FOREACH_STREAM(initialize_encoder(stream, &global));
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (strcmp(global.codec->name, "vp9") == 0 ||
+        strcmp(global.codec->name, "vp10") == 0) {
+      // Check to see if at least one stream uses 16 bit internal.
+      // Currently assume that the bit_depths for all streams using
+      // highbitdepth are the same.
+      FOREACH_STREAM({
+        if (stream->config.use_16bit_internal) {
+          use_16bit_internal = 1;
+        }
+        if (stream->config.cfg.g_profile == 0) {
+          input_shift = 0;
+        } else {
+          input_shift = (int)stream->config.cfg.g_bit_depth -
+              stream->config.cfg.g_input_bit_depth;
+        }
+      });
+    }
+#endif
+
     frame_avail = 1;
     got_data = 0;
 
@@ -1773,10 +2156,45 @@
         frame_avail = 0;
 
       if (frames_in > global.skip_frames) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        vpx_image_t *frame_to_encode;
+        if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+          assert(use_16bit_internal);
+          // The input bit depth and stream bit depth do not match, so
+          // upshift the frame to the stream bit depth.
+          if (!allocated_raw_shift) {
+            vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+                          input.width, input.height, 32);
+            allocated_raw_shift = 1;
+          }
+          vpx_img_upshift(&raw_shift, &raw, input_shift);
+          frame_to_encode = &raw_shift;
+        } else {
+          frame_to_encode = &raw;
+        }
+        vpx_usec_timer_start(&timer);
+        if (use_16bit_internal) {
+          assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH);
+          FOREACH_STREAM({
+            if (stream->config.use_16bit_internal)
+              encode_frame(stream, &global,
+                           frame_avail ? frame_to_encode : NULL,
+                           frames_in);
+            else
+              assert(0);
+          });
+        } else {
+          assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0);
+          FOREACH_STREAM(encode_frame(stream, &global,
+                                      frame_avail ? frame_to_encode : NULL,
+                                      frames_in));
+        }
+#else
         vpx_usec_timer_start(&timer);
         FOREACH_STREAM(encode_frame(stream, &global,
                                     frame_avail ? &raw : NULL,
                                     frames_in));
+#endif
         vpx_usec_timer_mark(&timer);
         cx_time += vpx_usec_timer_elapsed(&timer);
 
@@ -1785,7 +2203,8 @@
         got_data = 0;
         FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
 
-        if (!got_data && input.length && !streams->frames_out) {
+        if (!got_data && input.length && streams != NULL &&
+            !streams->frames_out) {
           lagged_count = global.limit ? seen_frames : ftello(input.file);
         } else if (input.length) {
           int64_t remaining;
@@ -1824,24 +2243,29 @@
     if (stream_cnt > 1)
       fprintf(stderr, "\n");
 
-    if (!global.quiet)
-      FOREACH_STREAM(fprintf(
-                       stderr,
-                       "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7lub/f %7"PRId64"b/s"
-                       " %7"PRId64" %s (%.2f fps)\033[K\n", pass + 1,
-                       global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
-                       seen_frames ? (unsigned long)(stream->nbytes * 8 / seen_frames) : 0,
-                       seen_frames ? (int64_t)stream->nbytes * 8
-                       * (int64_t)global.framerate.num / global.framerate.den
-                       / seen_frames
-                       : 0,
-                       stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
-                       stream->cx_time > 9999999 ? "ms" : "us",
-                       usec_to_fps(stream->cx_time, seen_frames));
-                    );
+    if (!global.quiet) {
+      FOREACH_STREAM(fprintf(stderr,
+          "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7"PRId64"b/f %7"PRId64"b/s"
+          " %7"PRId64" %s (%.2f fps)\033[K\n",
+          pass + 1,
+          global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
+          seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0,
+          seen_frames ? (int64_t)stream->nbytes * 8 *
+              (int64_t)global.framerate.num / global.framerate.den /
+              seen_frames : 0,
+          stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
+          stream->cx_time > 9999999 ? "ms" : "us",
+          usec_to_fps(stream->cx_time, seen_frames)));
+    }
 
-    if (global.show_psnr)
-      FOREACH_STREAM(show_psnr(stream));
+    if (global.show_psnr) {
+      if (global.codec->fourcc == VP9_FOURCC) {
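+        // For VP9, the PSNR peak value is (2^input_bit_depth) - 1.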
+        FOREACH_STREAM(
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1));
+      } else {
+        FOREACH_STREAM(show_psnr(stream, 255.0));
+      }
+    }
 
     FOREACH_STREAM(vpx_codec_destroy(&stream->encoder));
 
@@ -1893,6 +2317,10 @@
     });
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (allocated_raw_shift)
+    vpx_img_free(&raw_shift);
+#endif
   vpx_img_free(&raw);
   free(argv);
   free(streams);
diff --git a/libvpx/vpxenc.h b/libvpx/vpxenc.h
index 3d6728e..d867e9d 100644
--- a/libvpx/vpxenc.h
+++ b/libvpx/vpxenc.h
@@ -26,6 +26,7 @@
   I420,  // 4:2:0 8+ bit-depth
   I422,  // 4:2:2 8+ bit-depth
   I444,  // 4:4:4 8+ bit-depth
+  I440,  // 4:4:0 8+ bit-depth
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
 } ColorInputType;
 
diff --git a/libvpx/vpxstats.c b/libvpx/vpxstats.c
index 5f88f8d..172d893 100644
--- a/libvpx/vpxstats.c
+++ b/libvpx/vpxstats.c
@@ -41,6 +41,9 @@
 
     stats->file = fopen(fpf, "rb");
 
+    if (stats->file == NULL)
+      fatal("First-pass stats file does not exist!");
+
     if (fseek(stats->file, 0, SEEK_END))
       fatal("First-pass stats file must be seekable!");
 
diff --git a/libvpx/webmdec.cc b/libvpx/webmdec.cc
index 4383e8e..1020d04 100644
--- a/libvpx/webmdec.cc
+++ b/libvpx/webmdec.cc
@@ -41,6 +41,7 @@
   webm_ctx->block_frame_index = 0;
   webm_ctx->video_track_index = 0;
   webm_ctx->timestamp_ns = 0;
+  webm_ctx->is_key_frame = false;
 }
 
 void get_first_cluster(struct WebmInputContext *const webm_ctx) {
@@ -62,6 +63,7 @@
                  struct VpxInputContext *vpx_ctx) {
   mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
   webm_ctx->reader = reader;
+  webm_ctx->reached_eos = 0;
 
   mkvparser::EBMLHeader header;
   long long pos = 0;
@@ -101,6 +103,8 @@
     vpx_ctx->fourcc = VP8_FOURCC;
   } else if (!strncmp(video_track->GetCodecId(), "V_VP9", 5)) {
     vpx_ctx->fourcc = VP9_FOURCC;
+  } else if (!strncmp(video_track->GetCodecId(), "V_VP10", 6)) {
+    vpx_ctx->fourcc = VP10_FOURCC;
   } else {
     rewind_and_reset(webm_ctx, vpx_ctx);
     return 0;
@@ -120,6 +124,11 @@
                     uint8_t **buffer,
                     size_t *bytes_in_buffer,
                     size_t *buffer_size) {
+  // This check is needed for frame parallel decoding, in which case this
+  // function could be called even after it has reached the end of the input
+  // stream.
+  if (webm_ctx->reached_eos) {
+    return 1;
+  }
   mkvparser::Segment *const segment =
       reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
   const mkvparser::Cluster* cluster =
@@ -139,6 +148,7 @@
       cluster = segment->GetNext(cluster);
       if (cluster == NULL || cluster->EOS()) {
         *bytes_in_buffer = 0;
+        webm_ctx->reached_eos = 1;
         return 1;
       }
       status = cluster->GetFirst(block_entry);
@@ -182,6 +192,7 @@
   }
   *bytes_in_buffer = frame.len;
   webm_ctx->timestamp_ns = block->GetTime(cluster);
+  webm_ctx->is_key_frame = block->IsKey();
 
   mkvparser::MkvReader *const reader =
       reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
@@ -210,6 +221,7 @@
   webm_ctx->block_entry = NULL;
   webm_ctx->block_frame_index = 0;
   webm_ctx->timestamp_ns = 0;
+  webm_ctx->reached_eos = 0;
 
   return 0;
 }
diff --git a/libvpx/webmdec.h b/libvpx/webmdec.h
index 29b815d..7d16380 100644
--- a/libvpx/webmdec.h
+++ b/libvpx/webmdec.h
@@ -28,6 +28,8 @@
   int block_frame_index;
   int video_track_index;
   uint64_t timestamp_ns;
+  int is_key_frame;
+  int reached_eos;
 };
 
 // Checks if the input is a WebM file. If so, initializes WebMInputContext so
diff --git a/libvpx/webmenc.cc b/libvpx/webmenc.cc
index a0e542b..d41e700 100644
--- a/libvpx/webmenc.cc
+++ b/libvpx/webmenc.cc
@@ -24,7 +24,8 @@
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
-                            unsigned int fourcc) {
+                            unsigned int fourcc,
+                            const struct VpxRational *par) {
   mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(glob->stream);
   mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
   segment->Init(writer);
@@ -48,7 +49,31 @@
       static_cast<mkvmuxer::VideoTrack*>(
           segment->GetTrackByNumber(video_track_id));
   video_track->SetStereoMode(stereo_fmt);
-  video_track->set_codec_id(fourcc == VP8_FOURCC ? "V_VP8" : "V_VP9");
+  const char *codec_id;
+  switch (fourcc) {
+  case VP8_FOURCC:
+    codec_id = "V_VP8";
+    break;
+  case VP9_FOURCC:
+    codec_id = "V_VP9";
+    break;
+  case VP10_FOURCC:
+    codec_id = "V_VP10";
+    break;
+  default:
+    codec_id = "V_VP10";
+    break;
+  }
+  video_track->set_codec_id(codec_id);
+  if (par->numerator > 1 || par->denominator > 1) {
+    // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+    // to WebM format.
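+    // The display width is the coded width scaled by the pixel aspect ratio,
+    // rounded to the nearest integer.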
+    const uint64_t display_width =
+        static_cast<uint64_t>(((cfg->g_w * par->numerator * 1.0) /
+                               par->denominator) + .5);
+    video_track->set_display_width(display_width);
+    video_track->set_display_height(cfg->g_h);
+  }
   if (glob->debug) {
     video_track->set_uid(kDebugTrackUid);
   }
diff --git a/libvpx/webmenc.h b/libvpx/webmenc.h
index 0ac606b..c255d3d 100644
--- a/libvpx/webmenc.h
+++ b/libvpx/webmenc.h
@@ -42,7 +42,8 @@
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
-                            unsigned int fourcc);
+                            unsigned int fourcc,
+                            const struct VpxRational *par);
 
 void write_webm_block(struct EbmlGlobal *glob,
                       const vpx_codec_enc_cfg_t *cfg,
diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c
index 520c332..34ea96d 100644
--- a/libvpx/y4minput.c
+++ b/libvpx/y4minput.c
@@ -700,7 +700,7 @@
 
 int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
                    int only_420) {
-  char buffer[80];
+  char buffer[80] = {0};
   int  ret;
   int  i;
   /*Read until newline, or 80 cols, whichever happens first.*/
@@ -978,7 +978,9 @@
     _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
   else
     _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
-  _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+
+  if (_y4m->aux_buf_sz > 0)
+    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
   return 0;
 }
 
@@ -1039,12 +1041,12 @@
   c_w *= bytes_per_sample;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] =
+  _img->stride[VPX_PLANE_Y] = _img->stride[VPX_PLANE_ALPHA] =
       _y4m->pic_w * bytes_per_sample;
-  _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
-  _img->planes[PLANE_Y] = _y4m->dst_buf;
-  _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
-  _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
-  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
+  _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = c_w;
+  _img->planes[VPX_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[VPX_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[VPX_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
diff --git a/lint_config.sh b/lint_config.sh
new file mode 100755
index 0000000..d57e451
--- /dev/null
+++ b/lint_config.sh
@@ -0,0 +1,109 @@
+#!/bin/bash -e
+#
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script is used to compare vpx_config.h and vpx_config.asm to
+# verify the two files match.
+#
+# Arguments:
+#
+# -h - C Header file.
+# -a - ASM file.
+# -p - Print the options if correct.
+# -o - Output file.
+#
+# Usage:
+#
+# # Compare the two configuration files and output the final results.
+# ./lint_config.sh -h vpx_config.h -a vpx_config.asm -o libvpx.config -p
+
+export LC_ALL=C
+print_final="no"
+
+while getopts "h:a:o:p" flag
+do
+  if [ "$flag" = "h" ]; then
+    header_file=$OPTARG
+  elif [ "$flag" = "a" ]; then
+    asm_file=$OPTARG
+  elif [ "$flag" = "o" ]; then
+    out_file=$OPTARG
+  elif [ "$flag" = "p" ]; then
+    print_final="yes"
+  fi
+done
+
+if [ -z "$header_file" ]; then
+  echo "Header file not specified."
+  false
+  exit
+fi
+
+if [ -z "$asm_file" ]; then
+  echo "ASM file not specified."
+  false
+  exit
+fi
+
+# Concatenate the header file and the assembly file, then select the lines
+# that end with 0 or 1.
+combined_config="$(cat $header_file $asm_file | grep -E ' +[01] *$')"
+
+# Extra filtering for known exceptions.
+combined_config="$(echo "$combined_config" | grep -v DO1STROUNDING)"
+
+# Remove all spaces.
+combined_config="$(echo "$combined_config" | sed 's/[ \t]//g')"
+
+# Remove #define in the header file.
+combined_config="$(echo "$combined_config" | sed 's/.*define//')"
+
+# Remove equ in the ASM file.
+combined_config="$(echo "$combined_config" | sed 's/\.equ//')" # gas style
+combined_config="$(echo "$combined_config" | sed 's/equ//')" # rvds style
+
+# Remove %define in YASM ASM files.
+combined_config="$(echo "$combined_config" | sed 's/%define\s *//')" # yasm style
+
+# Remove useless comma in gas style assembly file.
+combined_config="$(echo "$combined_config" | sed 's/,//')"
+
+# Substitute 0 with =no.
+combined_config="$(echo "$combined_config" | sed 's/0$/=no/')"
+
+# Substitute 1 with =yes.
+combined_config="$(echo "$combined_config" | sed 's/1$/=yes/')"
+
+# Find the mismatch variables.
+odd_config="$(echo "$combined_config" | sort | uniq -u)"
+odd_vars="$(echo "$odd_config" | sed 's/=.*//' | uniq)"
+
+for var in $odd_vars; do
+  echo "Error: Configuration mismatch for $var."
+  echo "Header file: $header_file"
+  echo "$(cat -n $header_file | grep "$var[ \t]")"
+  echo "Assembly file: $asm_file"
+  echo "$(cat -n $asm_file | grep "$var[ \t]")"
+  echo ""
+done
+
+if [ -n "$odd_vars" ]; then
+  false
+  exit
+fi
+
+if [ "$print_final" = "no" ]; then
+  exit
+fi
+
+# Do some additional filtering to make libvpx happy.
+combined_config="$(echo "$combined_config" | grep -v ARCH_X86=no)"
+combined_config="$(echo "$combined_config" | grep -v ARCH_X86_64=no)"
+
+# Print out the unique configurations.
+if [ -n "$out_file" ]; then
+  echo "$combined_config" | sort | uniq > $out_file
+else
+  echo "$combined_config" | sort | uniq
+fi
diff --git a/mips-dspr2/vp8_rtcd.h b/mips-dspr2/vp8_rtcd.h
deleted file mode 100644
index 7d4dc20..0000000
--- a/mips-dspr2/vp8_rtcd.h
+++ /dev/null
@@ -1,341 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_clear_system_state_c();
-#define vp8_clear_system_state vp8_clear_system_state_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_dspr2
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_dspr2
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_dspr2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_dspr2
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_dspr2(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_dspr2
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_c
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_dspr2(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_dspr2
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_dspr2
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_dspr2
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_c
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_c
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_dspr2
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_dspr2
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_dspr2
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_dspr2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_dspr2
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
-
-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_c
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_c
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_c
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_dspr2
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_dspr2(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_dspr2
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_dspr2
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_dspr2
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_dspr2
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_dspr2
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_dspr2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_dspr2
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_c
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_c
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_c
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_c
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_c
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_c
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_c
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_c
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
-void vp8_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-#if HAVE_DSPR2
-#if CONFIG_VP8
-void dsputil_static_init();
-dsputil_static_init();
-#endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
-#endif
-#endif
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h
deleted file mode 100644
index ff122b8..0000000
--- a/mips-dspr2/vp9_rtcd.h
+++ /dev/null
@@ -1,741 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_dspr2
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_dspr2
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_dspr2
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_dspr2
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_dspr2
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_dspr2
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_dspr2
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_dspr2
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_dspr2
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_dspr2
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_dspr2
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_c
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_c
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_c
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_dspr2
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_dspr2
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_dspr2
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_dspr2
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_dspr2
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_dspr2
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_dspr2
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_dspr2
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_dspr2
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_dspr2
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_dspr2
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_dspr2
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_dspr2
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_dspr2
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_dspr2
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_dspr2
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_dspr2
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_dspr2
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_4_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_dspr2
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_dspr2
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_8_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_dspr2
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_dspr2
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_dspr2
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_dspr2
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_4_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_dspr2
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_dspr2
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_8_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_dspr2
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_dspr2
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_c
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_c
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_dspr2
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_dspr2
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_c
-
-void vp9_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-#if HAVE_DSPR2
-#if CONFIG_VP8
-void dsputil_static_init();
-dsputil_static_init();
-#endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
-#endif
-#endif
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
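
The per-platform rtcd headers removed above all follow the same static-dispatch convention: because these Android builds pass --disable-runtime-cpu-detect, each public entry point is bound at compile time to exactly one implementation (the generic *_c fallback or the MIPS *_dspr2 routine) through a plain #define rather than a function pointer. A minimal self-contained sketch of that pattern, using a made-up vp9_example_predictor name rather than a real libvpx symbol:

  #include <stdio.h>

  /* Stand-ins for a generated *_c fallback and a *_dspr2 optimization. */
  void vp9_example_predictor_c(void)     { puts("generic C path"); }
  void vp9_example_predictor_dspr2(void) { puts("DSPR2 path"); }

  /* With runtime CPU detection disabled, the generated header emits a plain
   * macro, so every call site binds to the chosen version at compile time. */
  #define vp9_example_predictor vp9_example_predictor_dspr2

  int main(void) {
    vp9_example_predictor();    /* macro expands to the DSPR2 routine */
    vp9_example_predictor_c();  /* the fallback is still callable directly */
    return 0;
  }

Swapping which function the macro names is essentially the only difference between the mips and mips-dspr2 variants of these generated headers.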
diff --git a/mips-dspr2/vpx_config.c b/mips-dspr2/vpx_config.c
deleted file mode 100644
index 2f8d8c3..0000000
--- a/mips-dspr2/vpx_config.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
-/*  */
-/* Use of this source code is governed by a BSD-style license */
-/* that can be found in the LICENSE file in the root of the source */
-/* tree. An additional intellectual property rights grant can be found */
-/* in the file PATENTS.  All contributing project authors may */
-/* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/vigneshv/Downloads/android-ndk-r10 --enable-dspr2 --disable-examples --disable-docs --enable-realtime-only";
-const char *vpx_codec_build_config(void) {return cfg;}
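
The configure invocation recorded in this generated vpx_config.c is exposed at runtime through libvpx's public vpx_codec_build_config(), which simply returns the cfg string above. A small usage sketch (assumes the program is linked against libvpx and that the vpx/ public headers are on the include path):

  #include <stdio.h>
  #include "vpx/vpx_codec.h"

  int main(void) {
    /* Prints the baked-in configure flags, e.g. the
     * "--force-target=mips32-android-gcc ... --enable-dspr2 ..." line above. */
    printf("%s\n", vpx_codec_build_config());
    return 0;
  }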
diff --git a/mips-dspr2/vpx_version.h b/mips-dspr2/vpx_version.h
deleted file mode 100644
index 59adf99..0000000
--- a/mips-dspr2/vpx_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define VERSION_MAJOR  1
-#define VERSION_MINOR  3
-#define VERSION_PATCH  0
-#define VERSION_EXTRA  "3825-gd4a47a6"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0-3825-gd4a47a6"
-#define VERSION_STRING      " v1.3.0-3825-gd4a47a6"
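
For reference, VERSION_PACKED folds the three components into one integer (major in bits 16-23, minor in bits 8-15, patch in bits 0-7), so the v1.3.0 header above packs to 0x010300. A standalone check of that arithmetic:

  #include <stdio.h>

  #define VERSION_MAJOR  1
  #define VERSION_MINOR  3
  #define VERSION_PATCH  0
  #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

  int main(void) {
    printf("packed: 0x%06x\n", VERSION_PACKED);   /* 0x010300 */
    printf("major=%d minor=%d patch=%d\n",
           (VERSION_PACKED >> 16) & 0xff,         /* 1 */
           (VERSION_PACKED >> 8) & 0xff,          /* 3 */
           VERSION_PACKED & 0xff);                /* 0 */
    return 0;
  }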
diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt
deleted file mode 100644
index 0e70976..0000000
--- a/mips/libvpx_srcs.txt
+++ /dev/null
@@ -1,308 +0,0 @@
-build/make/rtcd.pl
-build/make/version.sh
-CHANGELOG
-libs.mk
-vp8/common/alloccommon.c
-vp8/common/alloccommon.h
-vp8/common/blockd.c
-vp8/common/blockd.h
-vp8/common/coefupdateprobs.h
-vp8/common/common.h
-vp8/common/debugmodes.c
-vp8/common/default_coef_probs.h
-vp8/common/dequantize.c
-vp8/common/entropy.c
-vp8/common/entropy.h
-vp8/common/entropymode.c
-vp8/common/entropymode.h
-vp8/common/entropymv.c
-vp8/common/entropymv.h
-vp8/common/extend.c
-vp8/common/extend.h
-vp8/common/filter.c
-vp8/common/filter.h
-vp8/common/findnearmv.c
-vp8/common/findnearmv.h
-vp8/common/generic/systemdependent.c
-vp8/common/header.h
-vp8/common/idct_blk.c
-vp8/common/idctllm.c
-vp8/common/invtrans.h
-vp8/common/loopfilter.c
-vp8/common/loopfilter_filters.c
-vp8/common/loopfilter.h
-vp8/common/mbpitch.c
-vp8/common/modecont.c
-vp8/common/modecont.h
-vp8/common/mv.h
-vp8/common/onyxc_int.h
-vp8/common/onyxd.h
-vp8/common/onyx.h
-vp8/common/ppflags.h
-vp8/common/quant_common.c
-vp8/common/quant_common.h
-vp8/common/reconinter.c
-vp8/common/reconinter.h
-vp8/common/reconintra4x4.c
-vp8/common/reconintra4x4.h
-vp8/common/reconintra.c
-vp8/common/rtcd.c
-vp8/common/rtcd_defs.pl
-vp8/common/sad_c.c
-vp8/common/setupintrarecon.c
-vp8/common/setupintrarecon.h
-vp8/common/swapyv12buffer.c
-vp8/common/swapyv12buffer.h
-vp8/common/systemdependent.h
-vp8/common/threading.h
-vp8/common/treecoder.c
-vp8/common/treecoder.h
-vp8/common/variance_c.c
-vp8/common/variance.h
-vp8/common/vp8_entropymodedata.h
-vp8/decoder/dboolhuff.c
-vp8/decoder/dboolhuff.h
-vp8/decoder/decodeframe.c
-vp8/decoder/decodemv.c
-vp8/decoder/decodemv.h
-vp8/decoder/decoderthreading.h
-vp8/decoder/detokenize.c
-vp8/decoder/detokenize.h
-vp8/decoder/onyxd_if.c
-vp8/decoder/onyxd_int.h
-vp8/decoder/threading.c
-vp8/decoder/treereader.h
-vp8/encoder/bitstream.c
-vp8/encoder/bitstream.h
-vp8/encoder/block.h
-vp8/encoder/boolhuff.c
-vp8/encoder/boolhuff.h
-vp8/encoder/dct.c
-vp8/encoder/dct_value_cost.h
-vp8/encoder/dct_value_tokens.h
-vp8/encoder/defaultcoefcounts.h
-vp8/encoder/denoising.c
-vp8/encoder/denoising.h
-vp8/encoder/encodeframe.c
-vp8/encoder/encodeframe.h
-vp8/encoder/encodeintra.c
-vp8/encoder/encodeintra.h
-vp8/encoder/encodemb.c
-vp8/encoder/encodemb.h
-vp8/encoder/encodemv.c
-vp8/encoder/encodemv.h
-vp8/encoder/ethreading.c
-vp8/encoder/firstpass.h
-vp8/encoder/lookahead.c
-vp8/encoder/lookahead.h
-vp8/encoder/mcomp.c
-vp8/encoder/mcomp.h
-vp8/encoder/modecosts.c
-vp8/encoder/modecosts.h
-vp8/encoder/onyx_if.c
-vp8/encoder/onyx_int.h
-vp8/encoder/pickinter.c
-vp8/encoder/pickinter.h
-vp8/encoder/picklpf.c
-vp8/encoder/quantize.c
-vp8/encoder/quantize.h
-vp8/encoder/ratectrl.c
-vp8/encoder/ratectrl.h
-vp8/encoder/rdopt.c
-vp8/encoder/rdopt.h
-vp8/encoder/segmentation.c
-vp8/encoder/segmentation.h
-vp8/encoder/tokenize.c
-vp8/encoder/tokenize.h
-vp8/encoder/treewriter.c
-vp8/encoder/treewriter.h
-vp8/encoder/vp8_asm_enc_offsets.c
-vp8/vp8_common.mk
-vp8/vp8_cx_iface.c
-vp8/vp8cx.mk
-vp8/vp8_dx_iface.c
-vp8/vp8dx.mk
-vp9/common/vp9_alloccommon.c
-vp9/common/vp9_alloccommon.h
-vp9/common/vp9_blockd.c
-vp9/common/vp9_blockd.h
-vp9/common/vp9_common_data.c
-vp9/common/vp9_common_data.h
-vp9/common/vp9_common.h
-vp9/common/vp9_convolve.c
-vp9/common/vp9_convolve.h
-vp9/common/vp9_debugmodes.c
-vp9/common/vp9_entropy.c
-vp9/common/vp9_entropy.h
-vp9/common/vp9_entropymode.c
-vp9/common/vp9_entropymode.h
-vp9/common/vp9_entropymv.c
-vp9/common/vp9_entropymv.h
-vp9/common/vp9_enums.h
-vp9/common/vp9_filter.c
-vp9/common/vp9_filter.h
-vp9/common/vp9_frame_buffers.c
-vp9/common/vp9_frame_buffers.h
-vp9/common/vp9_idct.c
-vp9/common/vp9_idct.h
-vp9/common/vp9_loopfilter.c
-vp9/common/vp9_loopfilter_filters.c
-vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mv.h
-vp9/common/vp9_mvref_common.c
-vp9/common/vp9_mvref_common.h
-vp9/common/vp9_onyxc_int.h
-vp9/common/vp9_ppflags.h
-vp9/common/vp9_pred_common.c
-vp9/common/vp9_pred_common.h
-vp9/common/vp9_prob.c
-vp9/common/vp9_prob.h
-vp9/common/vp9_quant_common.c
-vp9/common/vp9_quant_common.h
-vp9/common/vp9_reconinter.c
-vp9/common/vp9_reconinter.h
-vp9/common/vp9_reconintra.c
-vp9/common/vp9_reconintra.h
-vp9/common/vp9_rtcd.c
-vp9/common/vp9_rtcd_defs.pl
-vp9/common/vp9_scale.c
-vp9/common/vp9_scale.h
-vp9/common/vp9_scan.c
-vp9/common/vp9_scan.h
-vp9/common/vp9_seg_common.c
-vp9/common/vp9_seg_common.h
-vp9/common/vp9_systemdependent.h
-vp9/common/vp9_textblit.h
-vp9/common/vp9_thread.c
-vp9/common/vp9_thread.h
-vp9/common/vp9_tile_common.c
-vp9/common/vp9_tile_common.h
-vp9/decoder/vp9_decodeframe.c
-vp9/decoder/vp9_decodeframe.h
-vp9/decoder/vp9_decodemv.c
-vp9/decoder/vp9_decodemv.h
-vp9/decoder/vp9_decoder.c
-vp9/decoder/vp9_decoder.h
-vp9/decoder/vp9_detokenize.c
-vp9/decoder/vp9_detokenize.h
-vp9/decoder/vp9_dsubexp.c
-vp9/decoder/vp9_dsubexp.h
-vp9/decoder/vp9_dthread.c
-vp9/decoder/vp9_dthread.h
-vp9/decoder/vp9_read_bit_buffer.c
-vp9/decoder/vp9_read_bit_buffer.h
-vp9/decoder/vp9_reader.c
-vp9/decoder/vp9_reader.h
-vp9/encoder/vp9_aq_complexity.c
-vp9/encoder/vp9_aq_complexity.h
-vp9/encoder/vp9_aq_cyclicrefresh.c
-vp9/encoder/vp9_aq_cyclicrefresh.h
-vp9/encoder/vp9_aq_variance.c
-vp9/encoder/vp9_aq_variance.h
-vp9/encoder/vp9_bitstream.c
-vp9/encoder/vp9_bitstream.h
-vp9/encoder/vp9_block.h
-vp9/encoder/vp9_context_tree.c
-vp9/encoder/vp9_context_tree.h
-vp9/encoder/vp9_cost.c
-vp9/encoder/vp9_cost.h
-vp9/encoder/vp9_dct.c
-vp9/encoder/vp9_encodeframe.c
-vp9/encoder/vp9_encodeframe.h
-vp9/encoder/vp9_encodemb.c
-vp9/encoder/vp9_encodemb.h
-vp9/encoder/vp9_encodemv.c
-vp9/encoder/vp9_encodemv.h
-vp9/encoder/vp9_encoder.c
-vp9/encoder/vp9_encoder.h
-vp9/encoder/vp9_extend.c
-vp9/encoder/vp9_extend.h
-vp9/encoder/vp9_firstpass.c
-vp9/encoder/vp9_firstpass.h
-vp9/encoder/vp9_lookahead.c
-vp9/encoder/vp9_lookahead.h
-vp9/encoder/vp9_mbgraph.c
-vp9/encoder/vp9_mbgraph.h
-vp9/encoder/vp9_mcomp.c
-vp9/encoder/vp9_mcomp.h
-vp9/encoder/vp9_picklpf.c
-vp9/encoder/vp9_picklpf.h
-vp9/encoder/vp9_pickmode.c
-vp9/encoder/vp9_pickmode.h
-vp9/encoder/vp9_quantize.c
-vp9/encoder/vp9_quantize.h
-vp9/encoder/vp9_ratectrl.c
-vp9/encoder/vp9_ratectrl.h
-vp9/encoder/vp9_rd.c
-vp9/encoder/vp9_rd.h
-vp9/encoder/vp9_rdopt.c
-vp9/encoder/vp9_rdopt.h
-vp9/encoder/vp9_resize.c
-vp9/encoder/vp9_resize.h
-vp9/encoder/vp9_sad.c
-vp9/encoder/vp9_segmentation.c
-vp9/encoder/vp9_segmentation.h
-vp9/encoder/vp9_speed_features.c
-vp9/encoder/vp9_speed_features.h
-vp9/encoder/vp9_subexp.c
-vp9/encoder/vp9_subexp.h
-vp9/encoder/vp9_svc_layercontext.c
-vp9/encoder/vp9_svc_layercontext.h
-vp9/encoder/vp9_temporal_filter.c
-vp9/encoder/vp9_temporal_filter.h
-vp9/encoder/vp9_tokenize.c
-vp9/encoder/vp9_tokenize.h
-vp9/encoder/vp9_treewriter.c
-vp9/encoder/vp9_treewriter.h
-vp9/encoder/vp9_variance.c
-vp9/encoder/vp9_variance.h
-vp9/encoder/vp9_write_bit_buffer.c
-vp9/encoder/vp9_write_bit_buffer.h
-vp9/encoder/vp9_writer.c
-vp9/encoder/vp9_writer.h
-vp9/vp9_common.mk
-vp9/vp9_cx_iface.c
-vp9/vp9cx.mk
-vp9/vp9_dx_iface.c
-vp9/vp9dx.mk
-vp9/vp9_iface_common.h
-vpx_config.c
-vpx/internal/vpx_codec_internal.h
-vpx/internal/vpx_psnr.h
-vpx_mem/include/vpx_mem_intrnl.h
-vpx_mem/vpx_mem.c
-vpx_mem/vpx_mem.h
-vpx_mem/vpx_mem.mk
-vpx_ports/asm_offsets.h
-vpx_ports/emmintrin_compat.h
-vpx_ports/mem.h
-vpx_ports/mem_ops_aligned.h
-vpx_ports/mem_ops.h
-vpx_ports/vpx_once.h
-vpx_ports/vpx_ports.mk
-vpx_ports/vpx_timer.h
-vpx_scale/generic/gen_scalers.c
-vpx_scale/generic/vpx_scale.c
-vpx_scale/generic/yv12config.c
-vpx_scale/generic/yv12extend.c
-vpx_scale/vpx_scale_asm_offsets.c
-vpx_scale/vpx_scale.h
-vpx_scale/vpx_scale.mk
-vpx_scale/vpx_scale_rtcd.c
-vpx_scale/vpx_scale_rtcd.pl
-vpx_scale/yv12config.h
-vpx/src/vpx_codec.c
-vpx/src/vpx_decoder.c
-vpx/src/vpx_encoder.c
-vpx/src/vpx_image.c
-vpx/src/vpx_psnr.c
-vpx/vp8cx.h
-vpx/vp8dx.h
-vpx/vp8.h
-vpx/vpx_codec.h
-vpx/vpx_codec.mk
-vpx/vpx_decoder.h
-vpx/vpx_encoder.h
-vpx/vpx_frame_buffer.h
-vpx/vpx_image.h
-vpx/vpx_integer.h
diff --git a/mips/vp8_rtcd.h b/mips/vp8_rtcd.h
deleted file mode 100644
index 35c7423..0000000
--- a/mips/vp8_rtcd.h
+++ /dev/null
@@ -1,323 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_clear_system_state_c();
-#define vp8_clear_system_state vp8_clear_system_state_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_c
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_c
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_c
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_c
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_c
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_c
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
-
-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_c
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_c
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_c
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_c
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_c
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_c
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_c
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_c
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_c
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_c
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_c
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
-void vp8_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-#if HAVE_DSPR2
-#if CONFIG_VP8
-void dsputil_static_init();
-dsputil_static_init();
-#endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
-#endif
-#endif
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
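Note on the deleted MIPS RTCD headers: because these configs are generated with --disable-runtime-cpu-detect, every entry point above resolves at compile time to its _c implementation through a plain #define, and setup_rtcd_internal() only runs the DSPr2 static-init hooks. When runtime CPU detection is enabled, the rtcd generator instead declares each entry point as an RTCD_EXTERN function pointer and assigns it once at startup. A minimal sketch of that pointer-based form follows; the _dspr2 symbol, HAS_DSPR2 flag, and mips_cpu_caps() probe are illustrative names only, not identifiers taken from this diff:

  /* Sketch of the generated form when runtime CPU detection is enabled. */
  void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch,
                           unsigned char *dst, int dst_pitch);
  void vp8_copy_mem16x16_dspr2(unsigned char *src, int src_pitch,
                               unsigned char *dst, int dst_pitch);
  /* Declared as a pointer instead of a #define onto the _c function. */
  RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch,
                                        unsigned char *dst, int dst_pitch);

  static void setup_rtcd_internal(void)
  {
      int flags = mips_cpu_caps();               /* hypothetical capability probe */
      vp8_copy_mem16x16 = vp8_copy_mem16x16_c;   /* portable C fallback */
      if (flags & HAS_DSPR2)
          vp8_copy_mem16x16 = vp8_copy_mem16x16_dspr2;
  }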
diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h
deleted file mode 100644
index f4efe1e..0000000
--- a/mips/vp9_rtcd.h
+++ /dev/null
@@ -1,700 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_c
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_c
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_c
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_c
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_c
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_c
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_c
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_c
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_c
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_c
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_c
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_c
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_c
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_c
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_c
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_c
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_c
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_c
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_c
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_c
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_c
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_c
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_c
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_c
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_c
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_c
-
-void vp9_rtcd(void);
-
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-#if HAVE_DSPR2
-#if CONFIG_VP8
-void dsputil_static_init();
-dsputil_static_init();
-#endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
-#endif
-#endif
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/mips/vpx_config.c b/mips/vpx_config.c
deleted file mode 100644
index 7e10d74..0000000
--- a/mips/vpx_config.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
-/*  */
-/* Use of this source code is governed by a BSD-style license */
-/* that can be found in the LICENSE file in the root of the source */
-/* tree. An additional intellectual property rights grant can be found */
-/* in the file PATENTS.  All contributing project authors may */
-/* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/vigneshv/Downloads/android-ndk-r10 --disable-examples --disable-docs --enable-realtime-only";
-const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips/vpx_version.h b/mips/vpx_version.h
deleted file mode 100644
index 59adf99..0000000
--- a/mips/vpx_version.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define VERSION_MAJOR  1
-#define VERSION_MINOR  3
-#define VERSION_PATCH  0
-#define VERSION_EXTRA  "3825-gd4a47a6"
-#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0-3825-gd4a47a6"
-#define VERSION_STRING      " v1.3.0-3825-gd4a47a6"
diff --git a/update_libvpx.sh b/update_libvpx.sh
new file mode 100755
index 0000000..92e40eb
--- /dev/null
+++ b/update_libvpx.sh
@@ -0,0 +1,123 @@
+#!/bin/bash -e
+#
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This tool is used to update libvpx source code to a revision of the upstream
+# repository. Modified from Chromium src/third_party/libvpx/update_libvpx.sh
+
+# Usage:
+#
+# $ ./update_libvpx.sh [branch | revision | file or url containing a revision]
+# When specifying a branch, it must be prefixed with origin/
+
+# Tools required for running this tool:
+#
+# 1. Linux / Mac
+# 2. git
+
+export LC_ALL=C
+
+# Location for the remote git repository.
+GIT_REPO="https://chromium.googlesource.com/webm/libvpx"
+
+# Update to tip-of-tree (TOT) by default.
+GIT_BRANCH="origin/master"
+
+# Relative path of target checkout.
+LIBVPX_SRC_DIR="libvpx"
+
+BASE_DIR=`pwd`
+
+if [ -n "$1" ]; then
+  GIT_BRANCH="$1"
+  if [ -f "$1"  ]; then
+    GIT_BRANCH=$(<"$1")
+  elif [[ $1 = http* ]]; then
+    GIT_BRANCH=`curl $1`
+  fi
+fi
+
+prev_hash="$(egrep "^Commit: [[:alnum:]]" README.android | awk '{ print $2 }')"
+echo "prev_hash:$prev_hash"
+
+rm -rf $LIBVPX_SRC_DIR
+mkdir $LIBVPX_SRC_DIR
+cd $LIBVPX_SRC_DIR
+
+# Start a local git repo.
+git clone $GIT_REPO .
+
+# Switch the content to the desired revision.
+git checkout -b tot $GIT_BRANCH
+
+add="$(git diff-index --diff-filter=A $prev_hash | \
+tr -s [:blank:] ' ' | cut -f6 -d\ )"
+delete="$(git diff-index --diff-filter=D $prev_hash | \
+tr -s [:blank:] ' ' | cut -f6 -d\ )"
+
+# Get the current commit hash.
+hash=$(git log -1 --format="%H")
+
+# README reminder.
+echo "Update README.android:"
+echo "==============="
+echo "Date: $(date +"%A %B %d %Y")"
+echo "Branch: $GIT_BRANCH"
+echo "Commit: $hash"
+echo "==============="
+echo ""
+
+# Commit message header.
+echo "Commit message:"
+echo "==============="
+echo "libvpx: Pull from upstream"
+echo ""
+
+# Output the current commit hash.
+echo "Current HEAD: $hash"
+echo ""
+
+# Output log for upstream from current hash.
+if [ -n "$prev_hash" ]; then
+  echo "git log from upstream:"
+  pretty_git_log="$(git log \
+                    --no-merges \
+                    --topo-order \
+                    --pretty="%h %s" \
+                    --max-count=20 \
+                    $prev_hash..$hash)"
+  if [ -z "$pretty_git_log" ]; then
+    echo "No log found. Checking for reverts."
+    pretty_git_log="$(git log \
+                      --no-merges \
+                      --topo-order \
+                      --pretty="%h %s" \
+                      --max-count=20 \
+                      $hash..$prev_hash)"
+  fi
+  echo "$pretty_git_log"
+  # If the log reaches the 20-commit cap, it is probably omitting even more.
+  if [ `echo "$pretty_git_log" | wc -l` -eq 20 ]; then
+    echo "<...>"
+  fi
+fi
+
+# Commit message footer.
+echo ""
+echo "==============="
+
+# The git history is no longer needed; remove the local git metadata.
+rm -rf .git .gitignore .gitattributes
+
+# Add and remove files.
+echo "$add" | xargs -I {} git add {}
+echo "$delete" | xargs -I {} git rm {}
+
+# Find empty directories and remove them.
+find . -type d -empty -exec git rm {} \;
+
+chmod 755 build/make/*.sh build/make/*.pl configure
+
+cd $BASE_DIR
diff --git a/x86/vp8_rtcd.h b/x86/vp8_rtcd.h
deleted file mode 100644
index c030275..0000000
--- a/x86/vp8_rtcd.h
+++ /dev/null
@@ -1,468 +0,0 @@
-#ifndef VP8_RTCD_H_
-#define VP8_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_sse2
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_sse2
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_xmm
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_sse2
-
-void vp8_clear_system_state_c();
-void vpx_reset_mmx_state();
-#define vp8_clear_system_state vpx_reset_mmx_state
-
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-#define vp8_copy32xn vp8_copy32xn_sse2
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
-
-int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter vp8_denoiser_filter_sse2
-
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
-#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
-
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
-#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
-#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
-
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
-
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_mmx
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_sse2
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
-
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
-
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
-
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx
-
-unsigned int vp8_get_mb_ss_c(const short *);
-unsigned int vp8_get_mb_ss_mmx(const short *);
-unsigned int vp8_get_mb_ss_sse2(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_sse2
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
-
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
-
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
-
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
-
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
-
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_xmm
-
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
-
-void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
-
-int vp8_mbuverror_c(struct macroblock *mb);
-int vp8_mbuverror_mmx(struct macroblock *mb);
-int vp8_mbuverror_xmm(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_xmm
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_wmt
-
-void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
-void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
-void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
-#define vp8_plane_add_noise vp8_plane_add_noise_wmt
-
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_wmt
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_wmt
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_wmt
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_wmt
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_wmt
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
-void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_sse2
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
-void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
-
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
-
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
-
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_sse2
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_mmx
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_sse2
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_sse2
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_wmt
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_wmt
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt
-
-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_sse2
-
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_sse2
-
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_sse2
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_wmt
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_wmt
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_wmt
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_wmt
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
-
-void vp8_rtcd(void);
-
-#ifdef RTCD_C
-#include "vpx_ports/x86.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = x86_simd_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/x86/vp9_rtcd.h b/x86/vp9_rtcd.h
deleted file mode 100644
index 1dfd1ab..0000000
--- a/x86/vp9_rtcd.h
+++ /dev/null
@@ -1,848 +0,0 @@
-#ifndef VP9_RTCD_H_
-#define VP9_RTCD_H_
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-struct search_site_config;
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_sse2
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_sse2
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_sse2
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_sse2
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_sse2
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_sse2
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_sse2
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_sse2
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_sse2
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_sse2
-
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16_1 vp9_fdct16x16_1_sse2
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_sse2
-
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_1 vp9_fdct32x32_1_sse2
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_sse2
-
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4_1 vp9_fdct4x4_1_sse2
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_sse2
-
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8_1 vp9_fdct8x8_1_sse2
-
-void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_sse2
-
-void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_sse2
-
-void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_sse2
-
-int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_full_range_search vp9_full_range_search_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_mmx
-
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get16x16var vp9_get16x16var_sse2
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get8x8var vp9_get8x8var_sse2
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_sse2
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_sse2
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_1_add vp9_idct16x16_1_add_sse2
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_sse2
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_sse2
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_sse2
-
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
-
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
-
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_12_add vp9_idct8x8_12_add_sse2
-
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
-
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_sse2
-
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
-
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2
-
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
-
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-
-void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_sse2
-
-void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_mmx
-
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_sse2
-
-void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_sse2
-
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_sse2
-
-void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_sse2
-
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_sse2
-
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_mmx
-
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_sse2
-
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_sse2
-
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
-#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_sse2
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b vp9_quantize_b_c
-
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_c
-
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-
-int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_refining_search_sad vp9_refining_search_sad_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x16 vp9_sad16x16_sse2
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad16x32 vp9_sad16x32_sse2
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad16x8 vp9_sad16x8_sse2
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x16 vp9_sad32x16_sse2
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad32x32 vp9_sad32x32_sse2
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad32x64 vp9_sad32x64_sse2
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad4x4 vp9_sad4x4_sse
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_sse
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_sse
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad4x8 vp9_sad4x8_sse
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_sse
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_sse
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad64x32 vp9_sad64x32_sse2
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad64x64 vp9_sad64x64_sse2
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x16 vp9_sad8x16_sse2
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vp9_sad8x4 vp9_sad8x4_sse2
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride);
-#define vp9_sad8x8 vp9_sad8x8_sse2
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_sse2
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_sse2
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_sse2
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_sse2
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_sse2
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_sse2
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_sse
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_sse
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_sse2
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_sse2
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_sse2
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_sse2
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_sse2
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_sse2
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_sse2
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_sse2
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_sse2
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_sse2
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_sse2
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_sse
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_sse
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_sse2
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_sse2
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_sse2
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_sse2
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_sse2
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_sse2
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_sse2
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_sse2
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_sse2
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_sse2
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_sse2
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_sse2
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_sse2
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_sse2
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_sse2
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_sse2
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_sse2
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_sse2
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_sse2
-
-void vp9_rtcd(void);
-
-#ifdef RTCD_C
-#include "vpx_ports/x86.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = x86_simd_caps();
-
-    (void)flags;
-
-}
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
diff --git a/x86/vpx_config.asm b/x86/vpx_config.asm
deleted file mode 100644
index 56edc66..0000000
--- a/x86/vpx_config.asm
+++ /dev/null
@@ -1,85 +0,0 @@
-ARCH_ARM equ 0
-ARCH_MIPS equ 0
-ARCH_X86 equ 1
-ARCH_X86_64 equ 0
-ARCH_PPC32 equ 0
-ARCH_PPC64 equ 0
-HAVE_EDSP equ 0
-HAVE_MEDIA equ 0
-HAVE_NEON equ 0
-HAVE_NEON_ASM equ 0
-HAVE_MIPS32 equ 0
-HAVE_DSPR2 equ 0
-HAVE_MMX equ 1
-HAVE_SSE equ 1
-HAVE_SSE2 equ 1
-HAVE_SSE3 equ 0
-HAVE_SSSE3 equ 0
-HAVE_SSE4_1 equ 0
-HAVE_AVX equ 0
-HAVE_AVX2 equ 0
-HAVE_ALTIVEC equ 0
-HAVE_VPX_PORTS equ 1
-HAVE_STDINT_H equ 1
-HAVE_ALT_TREE_LAYOUT equ 0
-HAVE_PTHREAD_H equ 1
-HAVE_SYS_MMAN_H equ 1
-HAVE_UNISTD_H equ 1
-CONFIG_EXTERNAL_BUILD equ 0
-CONFIG_INSTALL_DOCS equ 1
-CONFIG_INSTALL_BINS equ 1
-CONFIG_INSTALL_LIBS equ 1
-CONFIG_INSTALL_SRCS equ 0
-CONFIG_USE_X86INC equ 1
-CONFIG_DEBUG equ 0
-CONFIG_GPROF equ 0
-CONFIG_GCOV equ 0
-CONFIG_RVCT equ 0
-CONFIG_GCC equ 1
-CONFIG_MSVS equ 0
-CONFIG_PIC equ 1
-CONFIG_BIG_ENDIAN equ 0
-CONFIG_CODEC_SRCS equ 0
-CONFIG_DEBUG_LIBS equ 0
-CONFIG_FAST_UNALIGNED equ 1
-CONFIG_MEM_MANAGER equ 0
-CONFIG_MEM_TRACKER equ 0
-CONFIG_MEM_CHECKS equ 0
-CONFIG_DEQUANT_TOKENS equ 0
-CONFIG_DC_RECON equ 0
-CONFIG_RUNTIME_CPU_DETECT equ 0
-CONFIG_POSTPROC equ 1
-CONFIG_VP9_POSTPROC equ 0
-CONFIG_MULTITHREAD equ 1
-CONFIG_INTERNAL_STATS equ 0
-CONFIG_VP8_ENCODER equ 1
-CONFIG_VP8_DECODER equ 1
-CONFIG_VP9_ENCODER equ 1
-CONFIG_VP9_DECODER equ 1
-CONFIG_VP8 equ 1
-CONFIG_VP9 equ 1
-CONFIG_ENCODERS equ 1
-CONFIG_DECODERS equ 1
-CONFIG_STATIC_MSVCRT equ 0
-CONFIG_SPATIAL_RESAMPLING equ 1
-CONFIG_REALTIME_ONLY equ 1
-CONFIG_ONTHEFLY_BITPACKING equ 0
-CONFIG_ERROR_CONCEALMENT equ 0
-CONFIG_SHARED equ 0
-CONFIG_STATIC equ 1
-CONFIG_SMALL equ 0
-CONFIG_POSTPROC_VISUALIZER equ 0
-CONFIG_OS_SUPPORT equ 1
-CONFIG_UNIT_TESTS equ 0
-CONFIG_WEBM_IO equ 1
-CONFIG_LIBYUV equ 1
-CONFIG_DECODE_PERF_TESTS equ 0
-CONFIG_ENCODE_PERF_TESTS equ 0
-CONFIG_MULTI_RES_ENCODING equ 0
-CONFIG_TEMPORAL_DENOISING equ 1
-CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_EXPERIMENTAL equ 0
-CONFIG_SIZE_LIMIT equ 0
-CONFIG_SPATIAL_SVC equ 0
-CONFIG_VP9_TEMPORAL_DENOISING equ 0
-CONFIG_FP_MB_STATS equ 0
diff --git a/x86/vpx_config.c b/x86/vpx_config.c
deleted file mode 100644
index d212256..0000000
--- a/x86/vpx_config.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
-/*  */
-/* Use of this source code is governed by a BSD-style license */
-/* that can be found in the LICENSE file in the root of the source */
-/* tree. An additional intellectual property rights grant can be found */
-/* in the file PATENTS.  All contributing project authors may */
-/* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=x86-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/vigneshv/Downloads/android-ndk-r10 --disable-sse3 --disable-ssse3 --disable-sse4_1 --disable-avx --disable-avx2 --enable-pic --disable-examples --disable-docs --enable-realtime-only";
-const char *vpx_codec_build_config(void) {return cfg;}